Parser.js 7.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350
  1. var Tokenizer = require("./Tokenizer.js");
  2. /*
  3. Options:
  4. xmlMode: Disables the special behavior for script/style tags (false by default)
  5. lowerCaseAttributeNames: call .toLowerCase for each attribute name (true if xmlMode is `false`)
  6. lowerCaseTags: call .toLowerCase for each tag name (true if xmlMode is `false`)
  7. */
  8. /*
  9. Callbacks:
  10. oncdataend,
  11. oncdatastart,
  12. onclosetag,
  13. oncomment,
  14. oncommentend,
  15. onerror,
  16. onopentag,
  17. onprocessinginstruction,
  18. onreset,
  19. ontext
  20. */
  /*
  Lookup tables for implicitly closed elements.
  formTags: the set shared by the form control entries of openImpliesClose.
  openImpliesClose: when the tag used as key is opened, the element on top of
  the stack is closed as long as its name is listed in the value.
  Every table is prototype-less (like voidElements), so the `in` checks run
  against them only match the tag names listed here and never an inherited
  Object.prototype member such as "constructor" or "toString".
  */
  var formTags = {
    __proto__: null,
    input: true,
    option: true,
    optgroup: true,
    select: true,
    button: true,
    datalist: true,
    textarea: true
  };
  var openImpliesClose = {
    __proto__: null,
    tr      : { __proto__: null, tr: true, th: true, td: true },
    th      : { __proto__: null, th: true },
    td      : { __proto__: null, thead: true, th: true, td: true },
    body    : { __proto__: null, head: true, link: true, script: true },
    li      : { __proto__: null, li: true },
    p       : { __proto__: null, p: true },
    h1      : { __proto__: null, p: true },
    h2      : { __proto__: null, p: true },
    h3      : { __proto__: null, p: true },
    h4      : { __proto__: null, p: true },
    h5      : { __proto__: null, p: true },
    h6      : { __proto__: null, p: true },
    select  : formTags,
    input   : formTags,
    output  : formTags,
    button  : formTags,
    datalist: formTags,
    textarea: formTags,
    option  : { __proto__: null, option: true },
    optgroup: { __proto__: null, optgroup: true }
  };
  //Tags the parser treats as void outside of xmlMode: they are not pushed
  //onto the stack and are reported as closed as soon as their open tag ends.
  //The table has no prototype, so `name in voidElements` only matches the
  //names listed below.
  var voidElements = Object.create(null);
  [
    "area", "base", "basefont", "br", "col", "command", "embed", "frame",
    "hr", "img", "input", "isindex", "keygen", "link", "meta", "param",
    "source", "track", "wbr",
    //common self closing svg elements
    "path", "circle", "ellipse", "line", "rect", "use", "stop", "polyline",
    "polygon"
  ].forEach(function(tagName){
    voidElements[tagName] = true;
  });
  //matches the first whitespace character or slash; used to cut the name out
  //of a declaration or processing instruction
  var re_nameEnd = /\s|\//;
  85. function Parser(cbs, options){
  86. this._options = options || {};
  87. this._cbs = cbs || {};
  88. this._tagname = "";
  89. this._attribname = "";
  90. this._attribvalue = "";
  91. this._attribs = null;
  92. this._stack = [];
  93. this.startIndex = 0;
  94. this.endIndex = null;
  95. this._lowerCaseTagNames = "lowerCaseTags" in this._options ?
  96. !!this._options.lowerCaseTags :
  97. !this._options.xmlMode;
  98. this._lowerCaseAttributeNames = "lowerCaseAttributeNames" in this._options ?
  99. !!this._options.lowerCaseAttributeNames :
  100. !this._options.xmlMode;
  101. this._tokenizer = new Tokenizer(this._options, this);
  102. if(this._cbs.onparserinit) this._cbs.onparserinit(this);
  103. }
  104. require("util").inherits(Parser, require("events").EventEmitter);
  //Moves startIndex / endIndex on to the token that was just read.
  //initialOffset: how far before the tokenizer's section start the token
  //begins; only used while no token has been positioned yet (endIndex is null)
  Parser.prototype._updatePosition = function(initialOffset){
    var start;

    if(this.endIndex !== null){
      //continue right behind the previous token
      start = this.endIndex + 1;
    } else {
      var sectionStart = this._tokenizer._sectionStart;
      //never report a start before the beginning of the input
      start = sectionStart <= initialOffset ? 0 : sectionStart - initialOffset;
    }

    this.startIndex = start;
    this.endIndex = this._tokenizer.getAbsoluteIndex();
  };
  116. //Tokenizer event handlers
  //Tokenizer event: a run of text was read. Updates the position, pulls
  //endIndex back by one and forwards the text to the `ontext` callback.
  Parser.prototype.ontext = function(data){
    this._updatePosition(1);
    this.endIndex -= 1;

    var handlers = this._cbs;
    if(handlers.ontext){
      handlers.ontext(data);
    }
  };
  //Tokenizer event: the name of an opening tag was read.
  //Outside of xmlMode, elements on top of the stack that the new tag
  //implicitly ends are closed first; the tag is then pushed onto the stack
  //unless it is a void element (in xmlMode it is always pushed).
  Parser.prototype.onopentagname = function(name){
    var tagName = this._lowerCaseTagNames ? name.toLowerCase() : name;
    var htmlMode = !this._options.xmlMode;

    this._tagname = tagName;

    if(htmlMode && tagName in openImpliesClose){
      var impliedEnds = openImpliesClose[tagName];
      var top = this._stack[this._stack.length - 1];
      //onclosetag removes the element, so look at the new top every round
      while(top in impliedEnds){
        this.onclosetag(top);
        top = this._stack[this._stack.length - 1];
      }
    }

    if(!htmlMode || !(tagName in voidElements)){
      this._stack.push(tagName);
    }

    if(this._cbs.onopentagname){
      this._cbs.onopentagname(tagName);
    }
    //attributes are only collected when somebody listens for onopentag
    if(this._cbs.onopentag){
      this._attribs = {};
    }
  };
  //Tokenizer event: an opening tag is complete. Emits `onopentag` with the
  //collected attributes and, outside of xmlMode, an immediate `onclosetag`
  //for void elements; afterwards the current tag name is cleared.
  Parser.prototype.onopentagend = function(){
    this._updatePosition(1);

    var handlers = this._cbs;

    if(this._attribs){
      if(handlers.onopentag){
        handlers.onopentag(this._tagname, this._attribs);
      }
      this._attribs = null;
    }

    if(handlers.onclosetag && !this._options.xmlMode && (this._tagname in voidElements)){
      handlers.onclosetag(this._tagname);
    }

    this._tagname = "";
  };
  //Tokenizer event: a closing tag was read.
  //If the element is open, it and everything opened after it are closed.
  //Outside of xmlMode a stray </p> (and, when it cannot be matched against
  //the stack, a stray </br>) is reported as an opened and immediately closed
  //element; any other unmatched closing tag is ignored.
  Parser.prototype.onclosetag = function(name){
    this._updatePosition(1);

    var tagName = this._lowerCaseTagNames ? name.toLowerCase() : name;
    var xmlMode = this._options.xmlMode;

    //nothing is open, or the tag is a void element in HTML mode
    if(this._stack.length === 0 || (!xmlMode && tagName in voidElements)){
      if(!xmlMode && (tagName === "br" || tagName === "p")){
        this.onopentagname(tagName);
        this._closeCurrentTag();
      }
      return;
    }

    var openAt = this._stack.lastIndexOf(tagName);

    if(openAt === -1){
      if(tagName === "p" && !xmlMode){
        this.onopentagname(tagName);
        this._closeCurrentTag();
      }
      return;
    }

    if(!this._cbs.onclosetag){
      //nobody is listening: just drop the element and its descendants
      this._stack.length = openAt;
      return;
    }

    //report the innermost elements first
    for(var remaining = this._stack.length - openAt; remaining > 0; remaining--){
      this._cbs.onclosetag(this._stack.pop());
    }
  };
  //Tokenizer event: a tag ended with "/>". It only closes the element when
  //xmlMode or recognizeSelfClosing is set; otherwise it is handled like the
  //end of a regular opening tag.
  Parser.prototype.onselfclosingtag = function(){
    var opts = this._options;

    if(!opts.xmlMode && !opts.recognizeSelfClosing){
      this.onopentagend();
      return;
    }

    this._closeCurrentTag();
  };
  //Finishes the opening tag that is being read and closes the element again
  //right away, provided it ended up on top of the stack.
  Parser.prototype._closeCurrentTag = function(){
    //onopentagend clears _tagname, so remember it first
    var closing = this._tagname;

    this.onopentagend();

    //self-closing tags will be on the top of the stack
    //(cheaper check than in onclosetag)
    if(this._stack[this._stack.length - 1] !== closing){
      return;
    }

    if(this._cbs.onclosetag){
      this._cbs.onclosetag(closing);
    }
    this._stack.pop();
  };
  //Tokenizer event: an attribute name was read. Stores it, lowercased when
  //attribute names are configured to be case-insensitive.
  Parser.prototype.onattribname = function(name){
    this._attribname = this._lowerCaseAttributeNames ? name.toLowerCase() : name;
  };
  //Tokenizer event: a piece of the current attribute's value was read.
  //The pieces are concatenated until onattribend stores the value.
  Parser.prototype.onattribdata = function(value){
    this._attribvalue = this._attribvalue + value;
  };
  //Tokenizer event: the current attribute is complete. Reports it through
  //`onattribute` and stores it in the attribute map of the tag being read;
  //when a name occurs more than once, the first value is kept.
  Parser.prototype.onattribend = function(){
    var hasOwn = Object.prototype.hasOwnProperty;

    if(this._cbs.onattribute){
      this._cbs.onattribute(this._attribname, this._attribvalue);
    }

    var isFirstOccurrence = this._attribs && !hasOwn.call(this._attribs, this._attribname);
    if(isFirstOccurrence){
      this._attribs[this._attribname] = this._attribvalue;
    }

    this._attribvalue = "";
    this._attribname = "";
  };
  //Returns the name of a declaration or processing instruction: everything
  //in front of the first whitespace character or slash (the whole value when
  //there is none), lowercased when tag names are lowercased.
  Parser.prototype._getInstructionName = function(value){
    var nameEnd = value.search(re_nameEnd);
    var name = value;

    if(nameEnd >= 0){
      name = value.slice(0, nameEnd);
    }

    return this._lowerCaseTagNames ? name.toLowerCase() : name;
  };
  //Tokenizer event: a declaration was read. It is reported through the
  //`onprocessinginstruction` callback, with "!" prepended to both the name
  //and the raw value.
  Parser.prototype.ondeclaration = function(value){
    if(!this._cbs.onprocessinginstruction){
      return;
    }

    var instructionName = this._getInstructionName(value);
    this._cbs.onprocessinginstruction("!" + instructionName, "!" + value);
  };
  //Tokenizer event: a processing instruction was read. It is reported
  //through the `onprocessinginstruction` callback, with "?" prepended to
  //both the name and the raw value.
  Parser.prototype.onprocessinginstruction = function(value){
    if(!this._cbs.onprocessinginstruction){
      return;
    }

    var instructionName = this._getInstructionName(value);
    this._cbs.onprocessinginstruction("?" + instructionName, "?" + value);
  };
  //Tokenizer event: a comment was read. Updates the position and fires
  //`oncomment` with the comment text, followed by `oncommentend`.
  Parser.prototype.oncomment = function(value){
    this._updatePosition(4);

    var handlers = this._cbs;
    if(handlers.oncomment){
      handlers.oncomment(value);
    }
    if(handlers.oncommentend){
      handlers.oncommentend();
    }
  };
  //Tokenizer event: a CDATA section was read.
  //With xmlMode or recognizeCDATA the content is reported as text, wrapped in
  //`oncdatastart` / `oncdataend`; otherwise the section is reported as a
  //comment whose text is "[CDATA[" + value + "]]".
  Parser.prototype.oncdata = function(value){
    this._updatePosition(1);

    if(this._options.xmlMode || this._options.recognizeCDATA){
      if(this._cbs.oncdatastart) this._cbs.oncdatastart();
      if(this._cbs.ontext) this._cbs.ontext(value);
      if(this._cbs.oncdataend) this._cbs.oncdataend();
    } else {
      //The comment callbacks are invoked directly: the position of this
      //section has been updated above already, and going through
      //this.oncomment would update it a second time for the same token,
      //leaving startIndex behind the section instead of at its start.
      if(this._cbs.oncomment) this._cbs.oncomment("[CDATA[" + value + "]]");
      if(this._cbs.oncommentend) this._cbs.oncommentend();
    }
  };
  //Tokenizer event: passes an error on to the `onerror` callback, if any.
  Parser.prototype.onerror = function(err){
    if(!this._cbs.onerror){
      return;
    }
    this._cbs.onerror(err);
  };
  //Tokenizer event: the input is finished. Reports every element that is
  //still open as closed, innermost first, and then fires `onend`.
  //The stack itself is left as it is.
  Parser.prototype.onend = function(){
    if(this._cbs.onclosetag){
      var depth = this._stack.length;
      while(depth > 0){
        depth -= 1;
        this._cbs.onclosetag(this._stack[depth]);
      }
    }

    if(this._cbs.onend){
      this._cbs.onend();
    }
  };
  //Resets the parser to a blank state, ready to parse a new HTML document.
  //Fires `onreset` first, resets the tokenizer and all per-document state to
  //the values the constructor assigns, and fires `onparserinit` last.
  Parser.prototype.reset = function(){
    if(this._cbs.onreset) this._cbs.onreset();
    this._tokenizer.reset();

    this._tagname = "";
    this._attribname = "";
    //a partially read attribute value must not leak into the next document
    this._attribvalue = "";
    this._attribs = null;
    this._stack = [];

    //positions start over as well: with a stale endIndex, _updatePosition
    //would keep counting from the end of the previous document
    this.startIndex = 0;
    this.endIndex = null;

    if(this._cbs.onparserinit) this._cbs.onparserinit(this);
  };
  //Parses a complete HTML document and pushes it to the handler:
  //the parser is reset first, then `data` is passed to end() as the one and
  //only chunk.
  Parser.prototype.parseComplete = function(data){
    this.reset();
    this.end(data);
  };
  //Hands a chunk of input to the tokenizer.
  Parser.prototype.write = function(chunk){
    var tokenizer = this._tokenizer;
    tokenizer.write(chunk);
  };
  //Hands a last (optional) chunk to the tokenizer and tells it that the
  //input is finished.
  Parser.prototype.end = function(chunk){
    var tokenizer = this._tokenizer;
    tokenizer.end(chunk);
  };
  //Forwards a pause request to the tokenizer.
  Parser.prototype.pause = function(){
    var tokenizer = this._tokenizer;
    tokenizer.pause();
  };
  //Forwards a resume request to the tokenizer.
  Parser.prototype.resume = function(){
    var tokenizer = this._tokenizer;
    tokenizer.resume();
  };
  287. //alias for backwards compat
  //`parseChunk` and `done` are second names for `write` and `end`: each pair
  //refers to the very same function object
  288. Parser.prototype.parseChunk = Parser.prototype.write;
  289. Parser.prototype.done = Parser.prototype.end;
  //the module exports the Parser constructor itself
  290. module.exports = Parser;