第三方XML解析器(xpath.js)给出错误“未被捕获的结束标记名称:div与当前的起始标记名称不匹配”
问题描述:
使用 parse.com 的云代码(Cloud Code)时,我试图从网页上抓取数据并发送给我的 iOS 应用。我已经在 iOS 中本地实现了网页抓取代码,但想把这项任务移到后端。我使用的是一个名为 xpath.js 的 node.js 库(第三方 XML 解析器),代码如下:
// Cloud function "test": downloads the menu page, parses it into a DOM and
// reports how many table cells whose class starts with "menugridcell" exist.
// Every code path answers exactly once through `response`, so the function
// never hangs without a result.
Parse.Cloud.define("test", function(request, response) {
	Parse.Cloud.httpRequest({
		url: "http://menu.ha.ucla.edu/foodpro/default.asp",
		success: function(httpResponse) {
			try {
				var text = httpResponse.text;
				var xpath = require("cloud/xpath.js"), dom = require("cloud/dom-parser.js").DOMParser;
				var doc = new dom().parseFromString(text);
				var cells = xpath.select("//td[starts-with(@class, 'menugridcell')]", doc);
				// Only look inside the first cell when there is one.
				// NOTE(review): "//ul" is an absolute XPath; ".//ul" may be what is
				// intended for a search relative to the cell - confirm.
				var listNode = cells.length > 0 ? xpath.select("//ul", cells[0])[0] : undefined;
				// The result is indexed like an array, so its size is `length`
				// (`count` is undefined). Respond last, once all work is done.
				response.success("test " + cells.length);
			} catch (e) {
				// The DOM handler's fatalError rethrows parser errors; report them
				// to the caller instead of leaving the request unanswered.
				response.error("Parsing failed: " + e);
			}
		},
		error: function(httpResponse) {
			console.error('Request failed with response code ' + httpResponse.status);
			// Signal the failure to the caller as well as logging it.
			response.error('Request failed with response code ' + httpResponse.status);
		}
	});
});
但是,当我运行这段代码时,收到了以下错误:
"Uncaught end tag name: div is not match the current start tagName:script"
正如我前面提到的,我已经用一个独立的 Objective-C 库成功抓取了该网页的数据,因此标签是匹配的,问题不在网页源代码本身。
至于源代码,这是我正在抓取的网页(即上面代码中的 URL)。StackOverflow 不允许我直接链接到页面源代码,否则我会给出直接链接。
编辑:
这里是DOM-parser.js
/**
 * Parser front-end that turns markup text into a DOM document.
 *
 * @param {Object} [options] Parser settings (the properties read later are
 *   `domBuilder`, `errorHandler`, `locator` and `xmlns`). When omitted or
 *   falsy, a default with an empty `locator` object is used.
 */
function DOMParser(options){
	var settings = options;
	if(!settings){
		settings = {locator:{}};
	}
	this.options = settings;
}
/**
 * Parses `source` and returns the document produced by the DOM builder.
 *
 * @param {string} source Markup text to parse. A falsy value is reported
 *   through the error handler as "invalid document source".
 * @param {string} [mimeType] When it ends in "/html", "/xhtml", "/htm" or
 *   "/xhtm", the `nbsp` and `copy` entities and the XHTML default namespace
 *   are enabled.
 * @returns {*} The `document` property of the DOM builder.
 */
DOMParser.prototype.parseFromString = function(source,mimeType){
	var opts = this.options;
	var reader = new XMLReader();
	// The builder receives the SAX content and lexical events.
	var builder = opts.domBuilder || new DOMHandler();
	var customErrorHandler = opts.errorHandler;
	var loc = opts.locator;
	// Note: a caller-supplied `xmlns` map is used directly, not copied.
	var namespaces = opts.xmlns||{};
	var entities = {'lt':'<','gt':'>','amp':'&','quot':'"','apos':"'"};

	if(loc){
		builder.setDocumentLocator(loc);
	}

	reader.errorHandler = buildErrorHandler(customErrorHandler,builder,loc);
	reader.domBuilder = opts.domBuilder || builder;

	if(/\/x?html?$/.test(mimeType)){
		entities.nbsp = '\xa0';
		entities.copy = '\xa9';
		namespaces[''] = 'http://www.w3.org/1999/xhtml';
	}

	if(!source){
		reader.errorHandler.error("invalid document source");
		return builder.document;
	}

	reader.parse(source,namespaces,entities);
	return builder.document;
};
/**
 * Builds an object with `warning`, `error` and `fatalError` functions.
 *
 * - With no `errorImpl` and a `DOMHandler` builder, the builder itself is
 *   returned and used as the error handler.
 * - With no `errorImpl` and another builder, the builder takes the place of
 *   `errorImpl`.
 * - If `errorImpl` is a function, it is called as `(level, msg)` when it
 *   declares exactly two parameters, otherwise as `(msg)`.
 * - If `errorImpl` is an object, each level uses the same-named property, or
 *   else the first truthy alias, searched from the last alias to the first.
 *
 * Every message is suffixed with `_locator(locator)` before delivery.
 * Levels with no resolvable target get a no-op function.
 *
 * @param {Function|Object} [errorImpl] User-supplied error sink.
 * @param {Object} domBuilder DOM builder, used as fallback sink.
 * @param {Object} [locator] Position holder; defaults to an empty object.
 * @returns {Object} The error handler.
 */
function buildErrorHandler(errorImpl,domBuilder,locator){
	if(!errorImpl){
		if(domBuilder instanceof DOMHandler){
			return domBuilder;
		}
		errorImpl = domBuilder ;
	}
	var handlers = {};
	var implIsFunction = errorImpl instanceof Function;
	var where = locator||{};

	// Finds the function that should receive messages for `level`.
	function resolve(level,aliases){
		var target = errorImpl[level];
		if(target){
			return target;
		}
		if(implIsFunction){
			if(errorImpl.length == 2){
				return function(msg){errorImpl(level,msg)};
			}
			return errorImpl;
		}
		for(var idx = aliases.length - 1; idx >= 0; idx--){
			target = errorImpl[aliases[idx]];
			if(target){
				break;
			}
		}
		return target;
	}

	// Stores the wrapped target (or a no-op) under `level`.
	function install(level,aliases){
		var target = resolve(level,aliases);
		if(target){
			handlers[level] = function(msg){
				target(msg+_locator(where));
			};
		}else{
			handlers[level] = function(){};
		}
	}

	install('warning',['warn']);
	install('error',['warn','warning']);
	install('fatalError',['warn','warning','error']);
	return handlers;
}
/**
* +ContentHandler+ErrorHandler
* +LexicalHandler+EntityResolver2
* -DeclHandler-DTDHandler
*
* DefaultHandler:EntityResolver, DTDHandler, ContentHandler, ErrorHandler
* DefaultHandler2:DefaultHandler,LexicalHandler, DeclHandler, EntityResolver2
* @link http://www.saxproject.org/apidoc/org/xml/sax/helpers/DefaultHandler.html
*/
/**
 * SAX event receiver that builds a DOM document.
 * A fresh handler starts outside any CDATA section; `startCDATA` and
 * `endCDATA` toggle the flag that `characters` consults.
 */
function DOMHandler() {
	var insideCDATA = false;
	this.cdata = insideCDATA;
}
/**
 * Copies the current line and column from `locator` onto `node`.
 *
 * @param {Object} locator Source of `lineNumber` and `columnNumber`.
 * @param {Object} node Object that receives both properties.
 */
function position(locator,node){
	var fields = ['lineNumber','columnNumber'];
	for(var i = 0; i < fields.length; i++){
		node[fields[i]] = locator[fields[i]];
	}
}
/**
* @see org.xml.sax.ContentHandler#startDocument
* @link http://www.saxproject.org/apidoc/org/xml/sax/ContentHandler.html
*/
DOMHandler.prototype = {
	/** Creates the empty document that later events populate. */
	startDocument : function() {
		var impl = new DOMImplementation();
		this.document = impl.createDocument(null, null, null);
		if (this.locator) {
			this.document.documentURI = this.locator.systemId;
		}
	},
	/**
	 * Creates an element, attaches it under the current element (or the
	 * document), makes it current, then copies every attribute onto it.
	 */
	startElement:function(namespaceURI, localName, qName, attrs) {
		var doc = this.document;
		var element = doc.createElementNS(namespaceURI, qName||localName);
		var attrCount = attrs.length;
		appendElement(this, element);
		this.currentElement = element;
		if (this.locator) {
			position(this.locator, element);
		}
		for (var index = 0; index < attrCount; index++) {
			var attrNamespace = attrs.getURI(index);
			var attrValue = attrs.getValue(index);
			var attrName = attrs.getQName(index);
			var attr = doc.createAttributeNS(attrNamespace, attrName);
			if (attr.getOffset) {
				position(attr.getOffset(1), attr);
			}
			attr.nodeValue = attrValue;
			attr.value = attrValue;
			element.setAttributeNode(attr);
		}
	},
	/** Closes the current element by stepping back to its parent. */
	endElement:function(namespaceURI, localName, qName) {
		var closing = this.currentElement;
		this.currentElement = closing.parentNode;
	},
	/** Intentionally empty. */
	startPrefixMapping:function(prefix, uri) {
	},
	/** Intentionally empty. */
	endPrefixMapping:function(prefix) {
	},
	/** Creates a processing-instruction node and attaches it. */
	processingInstruction:function(target, data) {
		var instruction = this.document.createProcessingInstruction(target, data);
		if (this.locator) {
			position(this.locator, instruction);
		}
		appendElement(this, instruction);
	},
	/** Intentionally empty. */
	ignorableWhitespace:function(ch, start, length) {
	},
	/**
	 * Appends character data to the current element, as a CDATA section
	 * while the `cdata` flag is set and as a text node otherwise.
	 * Nothing happens without a current element or with empty text.
	 */
	characters:function(chars, start, length) {
		var text = _toString.apply(this,arguments);
		if (!this.currentElement || !text) {
			return;
		}
		var textNode = this.cdata
			? this.document.createCDATASection(text)
			: this.document.createTextNode(text);
		this.currentElement.appendChild(textNode);
		if (this.locator) {
			position(this.locator, textNode);
		}
	},
	/** Intentionally empty. */
	skippedEntity:function(name) {
	},
	/** Normalizes the finished document. */
	endDocument:function() {
		this.document.normalize();
	},
	/** Stores the locator and, when one is given, resets its line number to 0. */
	setDocumentLocator:function (locator) {
		this.locator = locator;
		if (locator) {
			locator.lineNumber = 0;
		}
	},
	//LexicalHandler
	/** Creates a comment node and attaches it. */
	comment:function(chars, start, length) {
		var text = _toString.apply(this,arguments);
		var commentNode = this.document.createComment(text);
		if (this.locator) {
			position(this.locator, commentNode);
		}
		appendElement(this, commentNode);
	},
	/** Sets the flag that characters() reads to emit CDATA sections. */
	startCDATA:function() {
		this.cdata = true;
	},
	/** Clears the CDATA flag. */
	endCDATA:function() {
		this.cdata = false;
	},
	/**
	 * Creates and attaches a document-type node when the document's
	 * implementation offers createDocumentType; otherwise does nothing.
	 */
	startDTD:function(name, publicId, systemId) {
		var impl = this.document.implementation;
		if (!impl || !impl.createDocumentType) {
			return;
		}
		var doctype = impl.createDocumentType(name, publicId, systemId);
		if (this.locator) {
			position(this.locator, doctype);
		}
		appendElement(this, doctype);
	},
	/**
	 * @see org.xml.sax.ErrorHandler
	 * @link http://www.saxproject.org/apidoc/org/xml/sax/ErrorHandler.html
	 */
	/** Logs a warning together with the current position. */
	warning:function(error) {
		console.warn(error, _locator(this.locator));
	},
	/** Logs an error together with the current position. */
	error:function(error) {
		console.error(error, _locator(this.locator));
	},
	/** Logs the error together with the current position, then rethrows it. */
	fatalError:function(error) {
		console.error(error, _locator(this.locator));
		throw error;
	}
};
/**
 * Formats a locator as a position suffix for log and error messages.
 *
 * The prefix literal is '\n@' (newline plus "@"). The published copy of
 * this file carried an email-obfuscation placeholder in its place, which
 * injected unrelated text into every message.
 *
 * @param {Object} [l] Locator with optional `systemId` plus `lineNumber`
 *   and `columnNumber`.
 * @returns {string|undefined} "\n@<systemId>#[line:<n>,col:<n>]", or
 *   undefined when no locator is given.
 */
function _locator(l){
	if(l){
		return '\n@'+(l.systemId ||'')+'#[line:'+l.lineNumber+',col:'+l.columnNumber+']'
	}
}
/**
 * Extracts the text of a SAX character event.
 *
 * JavaScript strings are sliced with substr(start, length). Any other
 * value is treated as coming from a Java SAX parser (Rhino): it is
 * converted through java.lang.String when a range is requested, and
 * returned unchanged otherwise.
 *
 * @param {*} chars Character data.
 * @param {number} start Offset of the first character.
 * @param {number} length Number of characters.
 * @returns {*} The extracted text, or `chars` itself.
 */
function _toString(chars,start,length){
	if(typeof chars != 'string'){
		// Java SAX bridge with xmldom on Rhino.
		var wantsRange = chars.length >= start+length || start;
		if(wantsRange){
			return new java.lang.String(chars,start,length)+'';
		}
		return chars;
	}
	return chars.substr(start,length);
}
/*
* @link http://www.saxproject.org/apidoc/org/xml/sax/ext/LexicalHandler.html
* used method of org.xml.sax.ext.LexicalHandler:
* #comment(chars, start, length)
* #startCDATA()
* #endCDATA()
* #startDTD(name, publicId, systemId)
*
*
* IGNORED method of org.xml.sax.ext.LexicalHandler:
* #endDTD()
* #startEntity(name)
* #endEntity(name)
*
*
* @link http://www.saxproject.org/apidoc/org/xml/sax/ext/DeclHandler.html
* IGNORED method of org.xml.sax.ext.DeclHandler
* #attributeDecl(eName, aName, type, mode, value)
* #elementDecl(name, model)
* #externalEntityDecl(name, publicId, systemId)
* #internalEntityDecl(name, value)
* @link http://www.saxproject.org/apidoc/org/xml/sax/ext/EntityResolver2.html
* IGNORED method of org.xml.sax.EntityResolver2
* #resolveEntity(String name,String publicId,String baseURI,String systemId)
* #resolveEntity(publicId, systemId)
* #getExternalSubset(name, baseURI)
* @link http://www.saxproject.org/apidoc/org/xml/sax/DTDHandler.html
* IGNORED method of org.xml.sax.DTDHandler
* #notationDecl(name, publicId, systemId) {};
* #unparsedEntityDecl(name, publicId, systemId, notationName) {};
*/
// Give every ignored SAX callback a stub on DOMHandler.prototype that returns null.
"endDTD,startEntity,endEntity,attributeDecl,elementDecl,externalEntityDecl,internalEntityDecl,resolveEntity,getExternalSubset,notationDecl,unparsedEntityDecl".split(',').forEach(function(name){
	DOMHandler.prototype[name] = function(){return null};
});
/* Private static helpers treated below as private instance methods, so don't need to add these to the public API; we might use a Relator to also get rid of non-standard public properties */
/**
 * Attaches `node` under the handler's current element, or under the
 * document itself when no element is open.
 * appendChild and setAttributeNS are performance-critical here.
 *
 * @param {Object} hander Handler holding `currentElement` and `document`.
 * @param {Object} node Node to attach.
 */
function appendElement (hander,node) {
	var parent = hander.currentElement || hander.document;
	parent.appendChild(node);
}
// Module wiring, executed only when a `require` function is available.
// The `var` declarations are hoisted out of the `if`, so XMLReader and
// DOMImplementation are visible to the functions defined above.
if(typeof require == 'function'){
// Reader instantiated by DOMParser.prototype.parseFromString.
var XMLReader = require('cloud/sax').XMLReader;
// Used by DOMHandler.prototype.startDocument; also re-exported.
var DOMImplementation = exports.DOMImplementation = require('cloud/dom').DOMImplementation;
// Re-exported from cloud/dom.
exports.XMLSerializer = require('cloud/dom').XMLSerializer ;
// The parser constructor defined in this file.
exports.DOMParser = DOMParser;
}
答
该页面的 HTML 脚本(<script>)中包含一些 XML 标记。其中的开始标签很可能被忽略了,因为它们带有转义的引号。解析器找到了 </div>
(位于脚本里的一个字符串中),并尝试将它与起始标签 <script>
匹配,结果失败。您的解析器是按 XML 来读取的,并不知道 XHTML 的脚本区域应当作为 CDATA 处理。
您必须让解析器忽略脚本标记(或将其内容按 CDATA 读取)。抱歉,我不知道具体该怎么做。
最好的问候Majo
在这种情况下,'cloud/dom-parser.js' 是什么库?它可能是按 XML 而不是 HTML 来解析的。如果改用 '.parseFromString(text,'text/html');' 会怎样? – loganfsmyth 2014-09-27 17:59:03
不行——还是同样的错误。我不知道 dom-parser.js 是哪个库,但我已经把该文件中的代码添加到问题里了 – Mahir 2014-09-28 06:58:19
您可以创建一个jsfiddle来演示此问题吗? – 2014-10-08 21:59:27