Tokenizer.js 25 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906
  // The module's single export is the Tokenizer constructor
  // (a function declaration, so it is already available here).
  module.exports = Tokenizer;

  // Helpers from the "entities" package: a code point -> string decoder and
  // the name -> replacement lookup tables used for named entities.
  var decodeCodePoint = require("entities/lib/decode_codepoint.js");
  var entityMap = require("entities/maps/entities.json");
  var legacyMap = require("entities/maps/legacy.json");
  var xmlMap = require("entities/maps/xml.json");

  // Tokenizer states. Each one gets the next integer, starting at 0,
  // in exactly this order.
  var i = 0;

  var TEXT = i++;
  var BEFORE_TAG_NAME = i++; //after <
  var IN_TAG_NAME = i++;
  var IN_SELF_CLOSING_TAG = i++;
  var BEFORE_CLOSING_TAG_NAME = i++;
  var IN_CLOSING_TAG_NAME = i++;
  var AFTER_CLOSING_TAG_NAME = i++;

  //attributes
  var BEFORE_ATTRIBUTE_NAME = i++;
  var IN_ATTRIBUTE_NAME = i++;
  var AFTER_ATTRIBUTE_NAME = i++;
  var BEFORE_ATTRIBUTE_VALUE = i++;
  var IN_ATTRIBUTE_VALUE_DQ = i++; // "
  var IN_ATTRIBUTE_VALUE_SQ = i++; // '
  var IN_ATTRIBUTE_VALUE_NQ = i++;

  //declarations
  var BEFORE_DECLARATION = i++; // !
  var IN_DECLARATION = i++;

  //processing instructions
  var IN_PROCESSING_INSTRUCTION = i++; // ?

  //comments
  var BEFORE_COMMENT = i++;
  var IN_COMMENT = i++;
  var AFTER_COMMENT_1 = i++;
  var AFTER_COMMENT_2 = i++;

  //cdata
  var BEFORE_CDATA_1 = i++; // [
  var BEFORE_CDATA_2 = i++; // C
  var BEFORE_CDATA_3 = i++; // D
  var BEFORE_CDATA_4 = i++; // A
  var BEFORE_CDATA_5 = i++; // T
  var BEFORE_CDATA_6 = i++; // A
  var IN_CDATA = i++; // [
  var AFTER_CDATA_1 = i++; // ]
  var AFTER_CDATA_2 = i++; // ]

  //special tags (script / style)
  var BEFORE_SPECIAL = i++; //S
  var BEFORE_SPECIAL_END = i++; //S

  var BEFORE_SCRIPT_1 = i++; //C
  var BEFORE_SCRIPT_2 = i++; //R
  var BEFORE_SCRIPT_3 = i++; //I
  var BEFORE_SCRIPT_4 = i++; //P
  var BEFORE_SCRIPT_5 = i++; //T
  var AFTER_SCRIPT_1 = i++; //C
  var AFTER_SCRIPT_2 = i++; //R
  var AFTER_SCRIPT_3 = i++; //I
  var AFTER_SCRIPT_4 = i++; //P
  var AFTER_SCRIPT_5 = i++; //T

  var BEFORE_STYLE_1 = i++; //T
  var BEFORE_STYLE_2 = i++; //Y
  var BEFORE_STYLE_3 = i++; //L
  var BEFORE_STYLE_4 = i++; //E
  var AFTER_STYLE_1 = i++; //T
  var AFTER_STYLE_2 = i++; //Y
  var AFTER_STYLE_3 = i++; //L
  var AFTER_STYLE_4 = i++; //E

  //entities
  var BEFORE_ENTITY = i++; //&
  var BEFORE_NUMERIC_ENTITY = i++; //#
  var IN_NAMED_ENTITY = i++;
  var IN_NUMERIC_ENTITY = i++;
  var IN_HEX_ENTITY = i++; //X

  // Values stored in Tokenizer#_special: which special tag (if any) is open.
  var j = 0;

  var SPECIAL_NONE = j++;
  var SPECIAL_SCRIPT = j++;
  var SPECIAL_STYLE = j++;
  /**
   * Reports whether `c` is one of the five characters the tokenizer treats
   * as whitespace: space, line feed, tab, form feed or carriage return.
   *
   * @param {string} c A single character.
   * @returns {boolean}
   */
  function whitespace(c){
    switch(c){
      case " ":
      case "\n":
      case "\t":
      case "\f":
      case "\r":
        return true;
      default:
        return false;
    }
  }
  /**
   * Builds a state handler that moves to `SUCCESS` when it sees `char`
   * and leaves the state untouched for any other character.
   *
   * @param {string} char Character to wait for.
   * @param {number} SUCCESS State to enter once `char` is seen.
   * @returns {function(string)} Handler to be invoked with the tokenizer as `this`.
   */
  function characterState(char, SUCCESS){
    return function(c){
      if(c !== char) return;
      this._state = SUCCESS;
    };
  }
  /**
   * Builds a state handler that accepts one expected character, compared
   * case-insensitively against `upper`.
   *
   * On a match the tokenizer enters `SUCCESS`. Otherwise it enters `FAILURE`
   * and steps `_index` back by one so the same character is processed again
   * in the new state.
   *
   * @param {string} upper Expected character (its lower-case form is accepted too).
   * @param {number} SUCCESS State entered on a match.
   * @param {number} FAILURE State entered on a mismatch.
   * @returns {function(string)} Handler to be invoked with the tokenizer as `this`.
   */
  function ifElseState(upper, SUCCESS, FAILURE){
    var lower = upper.toLowerCase();

    return function(c){
      if(c === upper || c === lower){
        this._state = SUCCESS;
        return;
      }
      this._state = FAILURE;
      this._index -= 1;
    };
  }
  /**
   * Builds a state handler used while matching the letters of a special tag
   * name ("script" / "style") in an opening tag.
   *
   * If the character matches `upper` (either case) the tokenizer advances to
   * `NEXT_STATE`. Otherwise the name is an ordinary tag name: the tokenizer
   * switches to IN_TAG_NAME and re-processes the same character there.
   *
   * @param {string} upper Expected letter, upper-case.
   * @param {number} NEXT_STATE State entered when the letter matches.
   * @returns {function(string)} Handler to be invoked with the tokenizer as `this`.
   */
  function consumeSpecialNameChar(upper, NEXT_STATE){
    var lower = upper.toLowerCase();

    return function(c){
      var matches = c === lower || c === upper;
      if(matches){
        this._state = NEXT_STATE;
        return;
      }
      this._state = IN_TAG_NAME;
      this._index -= 1; //consume the token again
    };
  }
  /**
   * Creates a tokenizer in its initial state (TEXT, empty buffer, running).
   *
   * @param {Object} [options]
   * @param {boolean} [options.xmlMode] Truthy to enable XML mode.
   * @param {boolean} [options.decodeEntities] Truthy to decode entities.
   * @param {Object} cbs Object whose `on*` methods receive the tokens.
   */
  function Tokenizer(options, cbs){
    // state machine
    this._state = TEXT;
    this._baseState = TEXT;
    this._special = SPECIAL_NONE;

    // buffer bookkeeping
    this._buffer = "";
    this._sectionStart = 0;
    this._index = 0;
    this._bufferOffset = 0; //chars removed from _buffer

    // lifecycle
    this._cbs = cbs;
    this._running = true;
    this._ended = false;

    // options, coerced to booleans
    this._xmlMode = Boolean(options && options.xmlMode);
    this._decodeEntities = Boolean(options && options.decodeEntities);
  }
  /**
   * TEXT state. A "<" starts a tag. An "&" starts an entity, but only when
   * entity decoding is enabled and no special tag is open. In both cases any
   * text collected so far is emitted via `ontext` first and a new section
   * starts at the current character. Every other character is left in the
   * current text section.
   */
  Tokenizer.prototype._stateText = function(c){
    var opensTag = c === "<";
    var opensEntity = !opensTag &&
      this._decodeEntities &&
      this._special === SPECIAL_NONE &&
      c === "&";

    if(!opensTag && !opensEntity) return;

    // flush pending text, if there is any
    if(this._index > this._sectionStart){
      this._cbs.ontext(this._getSection());
    }

    if(opensTag){
      this._state = BEFORE_TAG_NAME;
    } else {
      this._baseState = TEXT;
      this._state = BEFORE_ENTITY;
    }
    this._sectionStart = this._index;
  };
  /**
   * State right after a "<". Decides what kind of construct follows:
   * closing tag ("/"), declaration ("!"), processing instruction ("?"),
   * or an opening tag name. A ">" or whitespace, or any character other than
   * "/" while a special tag is open, sends the tokenizer back to TEXT.
   * A second "<" emits the section read so far as text and starts over.
   */
  Tokenizer.prototype._stateBeforeTagName = function(c){
    if(c === "/"){
      this._state = BEFORE_CLOSING_TAG_NAME;
      return;
    }

    if(c === ">" || this._special !== SPECIAL_NONE || whitespace(c)){
      this._state = TEXT;
      return;
    }

    switch(c){
      case "!":
        this._state = BEFORE_DECLARATION;
        this._sectionStart = this._index + 1;
        break;
      case "?":
        this._state = IN_PROCESSING_INSTRUCTION;
        this._sectionStart = this._index + 1;
        break;
      case "<":
        this._cbs.ontext(this._getSection());
        this._sectionStart = this._index;
        break;
      default:
        // outside XML mode, a leading "s" may start "script" or "style"
        if(!this._xmlMode && (c === "s" || c === "S")){
          this._state = BEFORE_SPECIAL;
        } else {
          this._state = IN_TAG_NAME;
        }
        this._sectionStart = this._index;
    }
  };
  /**
   * Inside an opening tag's name. The name ends at "/", ">" or whitespace:
   * it is emitted as `onopentagname` and the terminating character is
   * re-processed in BEFORE_ATTRIBUTE_NAME.
   */
  Tokenizer.prototype._stateInTagName = function(c){
    var nameEnded = c === "/" || c === ">" || whitespace(c);
    if(!nameEnded) return;

    this._emitToken("onopentagname");
    this._state = BEFORE_ATTRIBUTE_NAME;
    this._index -= 1;
  };
  /**
   * State right after "</". Whitespace is skipped and ">" returns to TEXT.
   * While a special tag is open, only a name starting with "s"/"S" is
   * followed up (BEFORE_SPECIAL_END); anything else goes back to TEXT and is
   * re-processed there. Otherwise a closing tag name starts at the current
   * character.
   */
  Tokenizer.prototype._stateBeforeCloseingTagName = function(c){
    if(whitespace(c)) return;

    if(c === ">"){
      this._state = TEXT;
      return;
    }

    if(this._special === SPECIAL_NONE){
      this._state = IN_CLOSING_TAG_NAME;
      this._sectionStart = this._index;
      return;
    }

    if(c === "s" || c === "S"){
      this._state = BEFORE_SPECIAL_END;
    } else {
      this._state = TEXT;
      this._index -= 1;
    }
  };
  /**
   * Inside a closing tag's name. The name ends at ">" or whitespace: it is
   * emitted as `onclosetag` and the terminating character is re-processed in
   * AFTER_CLOSING_TAG_NAME.
   */
  Tokenizer.prototype._stateInCloseingTagName = function(c){
    if(c !== ">" && !whitespace(c)) return;

    this._emitToken("onclosetag");
    this._state = AFTER_CLOSING_TAG_NAME;
    this._index -= 1;
  };
  /**
   * After a closing tag's name: everything up to the next ">" is ignored.
   * The ">" returns the tokenizer to TEXT, with the next section starting
   * just behind it.
   */
  Tokenizer.prototype._stateAfterCloseingTagName = function(c){
    if(c !== ">") return; //skip everything until ">"

    this._state = TEXT;
    this._sectionStart = this._index + 1;
  };
  /**
   * Inside an opening tag, between attributes. ">" ends the tag
   * (`onopentagend`) and returns to TEXT, "/" may start a self-closing tag,
   * whitespace is skipped, and any other character starts an attribute name.
   */
  Tokenizer.prototype._stateBeforeAttributeName = function(c){
    switch(c){
      case ">":
        this._cbs.onopentagend();
        this._state = TEXT;
        this._sectionStart = this._index + 1;
        break;
      case "/":
        this._state = IN_SELF_CLOSING_TAG;
        break;
      default:
        if(whitespace(c)) break;
        this._state = IN_ATTRIBUTE_NAME;
        this._sectionStart = this._index;
    }
  };
  /**
   * After a "/" inside an opening tag. A ">" completes a self-closing tag
   * (`onselfclosingtag`) and returns to TEXT. Whitespace is skipped. Any
   * other character means the "/" was stray: it is re-processed in
   * BEFORE_ATTRIBUTE_NAME.
   */
  Tokenizer.prototype._stateInSelfClosingTag = function(c){
    if(c === ">"){
      this._cbs.onselfclosingtag();
      this._state = TEXT;
      this._sectionStart = this._index + 1;
      return;
    }

    if(whitespace(c)) return;

    this._state = BEFORE_ATTRIBUTE_NAME;
    this._index -= 1;
  };
  /**
   * Inside an attribute name. The name ends at "=", "/", ">" or whitespace:
   * it is reported via `onattribname`, the section is closed, and the
   * terminating character is re-processed in AFTER_ATTRIBUTE_NAME.
   */
  Tokenizer.prototype._stateInAttributeName = function(c){
    var nameEnded = c === "=" || c === "/" || c === ">" || whitespace(c);
    if(!nameEnded) return;

    this._cbs.onattribname(this._getSection());
    this._sectionStart = -1;
    this._state = AFTER_ATTRIBUTE_NAME;
    this._index -= 1;
  };
  /**
   * After an attribute name. "=" announces a value and whitespace is
   * skipped. Anything else means the attribute has no value, so
   * `onattribend` fires: "/" and ">" are re-processed in
   * BEFORE_ATTRIBUTE_NAME, and any other character starts the next
   * attribute name.
   */
  Tokenizer.prototype._stateAfterAttributeName = function(c){
    if(c === "="){
      this._state = BEFORE_ATTRIBUTE_VALUE;
      return;
    }

    if(whitespace(c)) return;

    // value-less attribute
    this._cbs.onattribend();

    if(c === "/" || c === ">"){
      this._state = BEFORE_ATTRIBUTE_NAME;
      this._index -= 1;
    } else {
      this._state = IN_ATTRIBUTE_NAME;
      this._sectionStart = this._index;
    }
  };
  /**
   * After the "=" of an attribute. A quote opens a quoted value that starts
   * behind the quote. Whitespace is skipped. Any other character opens an
   * unquoted value starting at that character, which is then re-processed in
   * IN_ATTRIBUTE_VALUE_NQ.
   */
  Tokenizer.prototype._stateBeforeAttributeValue = function(c){
    if(c === "\"" || c === "'"){
      this._state = c === "\"" ? IN_ATTRIBUTE_VALUE_DQ : IN_ATTRIBUTE_VALUE_SQ;
      this._sectionStart = this._index + 1;
      return;
    }

    if(whitespace(c)) return;

    this._state = IN_ATTRIBUTE_VALUE_NQ;
    this._sectionStart = this._index;
    this._index -= 1; //reconsume token
  };
  /**
   * Inside a double-quoted attribute value. The closing quote emits the
   * collected data and ends the attribute. With entity decoding enabled,
   * "&" emits the data collected so far and starts an entity, remembering
   * this state as the one to come back to.
   */
  Tokenizer.prototype._stateInAttributeValueDoubleQuotes = function(c){
    var closesValue = c === "\"";
    if(!closesValue && !(this._decodeEntities && c === "&")) return;

    this._emitToken("onattribdata");

    if(closesValue){
      this._cbs.onattribend();
      this._state = BEFORE_ATTRIBUTE_NAME;
    } else {
      this._baseState = this._state;
      this._state = BEFORE_ENTITY;
      this._sectionStart = this._index;
    }
  };
  /**
   * Inside a single-quoted attribute value. The closing quote emits the
   * collected data and ends the attribute. With entity decoding enabled,
   * "&" emits the data collected so far and starts an entity, remembering
   * this state as the one to come back to.
   */
  Tokenizer.prototype._stateInAttributeValueSingleQuotes = function(c){
    var closesValue = c === "'";
    if(!closesValue && !(this._decodeEntities && c === "&")) return;

    this._emitToken("onattribdata");

    if(closesValue){
      this._cbs.onattribend();
      this._state = BEFORE_ATTRIBUTE_NAME;
    } else {
      this._baseState = this._state;
      this._state = BEFORE_ENTITY;
      this._sectionStart = this._index;
    }
  };
  /**
   * Inside an unquoted attribute value. Whitespace or ">" ends the value:
   * the data is emitted, the attribute is closed, and the terminating
   * character is re-processed in BEFORE_ATTRIBUTE_NAME. With entity decoding
   * enabled, "&" emits the data collected so far and starts an entity,
   * remembering this state as the one to come back to.
   */
  Tokenizer.prototype._stateInAttributeValueNoQuotes = function(c){
    var closesValue = whitespace(c) || c === ">";
    if(!closesValue && !(this._decodeEntities && c === "&")) return;

    this._emitToken("onattribdata");

    if(closesValue){
      this._cbs.onattribend();
      this._state = BEFORE_ATTRIBUTE_NAME;
      this._index -= 1;
    } else {
      this._baseState = this._state;
      this._state = BEFORE_ENTITY;
      this._sectionStart = this._index;
    }
  };
  /**
   * State right after "<!". "[" may begin a CDATA section, "-" may begin a
   * comment, and everything else is treated as a declaration.
   */
  Tokenizer.prototype._stateBeforeDeclaration = function(c){
    if(c === "["){
      this._state = BEFORE_CDATA_1;
    } else if(c === "-"){
      this._state = BEFORE_COMMENT;
    } else {
      this._state = IN_DECLARATION;
    }
  };
  /**
   * Inside a declaration. It runs up to the next ">", where the section is
   * handed to `ondeclaration` and the tokenizer returns to TEXT.
   */
  Tokenizer.prototype._stateInDeclaration = function(c){
    if(c !== ">") return;

    this._cbs.ondeclaration(this._getSection());
    this._state = TEXT;
    this._sectionStart = this._index + 1;
  };
  /**
   * Inside a processing instruction. It runs up to the next ">", where the
   * section is handed to `onprocessinginstruction` and the tokenizer returns
   * to TEXT.
   */
  Tokenizer.prototype._stateInProcessingInstruction = function(c){
    if(c !== ">") return;

    this._cbs.onprocessinginstruction(this._getSection());
    this._state = TEXT;
    this._sectionStart = this._index + 1;
  };
  /**
   * State after "<!-". A second "-" opens a comment whose content starts
   * behind it. Anything else turns the construct into a declaration.
   */
  Tokenizer.prototype._stateBeforeComment = function(c){
    if(c !== "-"){
      this._state = IN_DECLARATION;
      return;
    }

    this._state = IN_COMMENT;
    this._sectionStart = this._index + 1;
  };
  /**
   * Inside a comment. A "-" might be the start of the closing "-->".
   */
  Tokenizer.prototype._stateInComment = function(c){
    if(c !== "-") return;
    this._state = AFTER_COMMENT_1;
  };
  /**
   * Inside a comment, after one "-". A second "-" moves on towards the
   * closing "-->". Anything else is ordinary comment content.
   */
  Tokenizer.prototype._stateAfterComment1 = function(c){
    this._state = c === "-" ? AFTER_COMMENT_2 : IN_COMMENT;
  };
  /**
   * Inside a comment, after "--". A ">" closes the comment: its content,
   * minus the two trailing dashes, is passed to `oncomment`. Another "-"
   * keeps waiting (`--->`). Anything else is ordinary comment content.
   */
  Tokenizer.prototype._stateAfterComment2 = function(c){
    if(c === "-") return; // stay in AFTER_COMMENT_2 (`--->`)

    if(c !== ">"){
      this._state = IN_COMMENT;
      return;
    }

    //remove 2 trailing chars
    var content = this._buffer.substring(this._sectionStart, this._index - 2);
    this._cbs.oncomment(content);
    this._state = TEXT;
    this._sectionStart = this._index + 1;
  };
  // After "<![": match the letters C, D, A, T, A one state at a time
  // (either case, as implemented by ifElseState). On the first mismatch the
  // construct is treated as a declaration and the character is re-processed.
  Tokenizer.prototype._stateBeforeCdata1 = ifElseState(
    "C",
    BEFORE_CDATA_2,
    IN_DECLARATION
  );
  Tokenizer.prototype._stateBeforeCdata2 = ifElseState(
    "D",
    BEFORE_CDATA_3,
    IN_DECLARATION
  );
  Tokenizer.prototype._stateBeforeCdata3 = ifElseState(
    "A",
    BEFORE_CDATA_4,
    IN_DECLARATION
  );
  Tokenizer.prototype._stateBeforeCdata4 = ifElseState(
    "T",
    BEFORE_CDATA_5,
    IN_DECLARATION
  );
  Tokenizer.prototype._stateBeforeCdata5 = ifElseState(
    "A",
    BEFORE_CDATA_6,
    IN_DECLARATION
  );
  /**
   * After "<![CDATA". A "[" opens the CDATA section, whose content starts
   * behind it. Anything else turns the construct into a declaration and the
   * character is re-processed there.
   */
  Tokenizer.prototype._stateBeforeCdata6 = function(c){
    if(c !== "["){
      this._state = IN_DECLARATION;
      this._index -= 1;
      return;
    }

    this._state = IN_CDATA;
    this._sectionStart = this._index + 1;
  };
  /**
   * Inside a CDATA section. A "]" might be the start of the closing "]]>".
   */
  Tokenizer.prototype._stateInCdata = function(c){
    if(c !== "]") return;
    this._state = AFTER_CDATA_1;
  };
  /**
   * Inside a CDATA section, after one "]". A second "]" moves on towards
   * the closing "]]>". Any other character is ordinary content, so the
   * tokenizer returns to IN_CDATA.
   *
   * Returning to IN_CDATA matters: if the state were kept on a mismatch,
   * the two brackets of the terminator would not have to be adjacent and
   * input such as "a]b]>" would end the section early.
   */
  Tokenizer.prototype._stateAfterCdata1 = function(c){
    if(c === "]"){
      this._state = AFTER_CDATA_2;
    } else {
      this._state = IN_CDATA;
    }
  };
  /**
   * Inside a CDATA section, after "]]". A ">" closes the section: its
   * content, minus the two trailing brackets, is passed to `oncdata`.
   * Another "]" keeps waiting (`]]]>`). Anything else is ordinary content.
   */
  Tokenizer.prototype._stateAfterCdata2 = function(c){
    if(c === "]") return; // stay in AFTER_CDATA_2 (`]]]>`)

    if(c !== ">"){
      this._state = IN_CDATA;
      return;
    }

    //remove 2 trailing chars
    var content = this._buffer.substring(this._sectionStart, this._index - 2);
    this._cbs.oncdata(content);
    this._state = TEXT;
    this._sectionStart = this._index + 1;
  };
  /**
   * After "<s" in an opening tag. "c" continues towards "script" and "t"
   * towards "style" (either case). Anything else makes this an ordinary tag
   * name and the character is re-processed in IN_TAG_NAME.
   */
  Tokenizer.prototype._stateBeforeSpecial = function(c){
    if(c === "c" || c === "C"){
      this._state = BEFORE_SCRIPT_1;
      return;
    }

    if(c === "t" || c === "T"){
      this._state = BEFORE_STYLE_1;
      return;
    }

    this._state = IN_TAG_NAME;
    this._index -= 1; //consume the token again
  };
  /**
   * After "</s" while a special tag (script/style) is open. If the next
   * letter continues the name of the tag that is open, keep matching it.
   * Otherwise this is not the matching end tag: go back to TEXT and
   * re-process the character there.
   *
   * Re-processing matters when the character is "<": without it the "<"
   * would be swallowed as text and a real end tag that starts right there
   * (e.g. "</s</script>") would be missed. This mirrors what the other
   * end-tag matching states built with ifElseState already do on failure.
   */
  Tokenizer.prototype._stateBeforeSpecialEnd = function(c){
    if(this._special === SPECIAL_SCRIPT && (c === "c" || c === "C")){
      this._state = AFTER_SCRIPT_1;
    } else if(this._special === SPECIAL_STYLE && (c === "t" || c === "T")){
      this._state = AFTER_STYLE_1;
    } else {
      this._state = TEXT;
      this._index--; //reconsume the token
    }
  };
  // After "<sc" in an opening tag: match the letters R, I, P, T of "script"
  // one state at a time. A mismatch falls back to an ordinary tag name
  // (see consumeSpecialNameChar).
  Tokenizer.prototype._stateBeforeScript1 = consumeSpecialNameChar(
    "R",
    BEFORE_SCRIPT_2
  );
  Tokenizer.prototype._stateBeforeScript2 = consumeSpecialNameChar(
    "I",
    BEFORE_SCRIPT_3
  );
  Tokenizer.prototype._stateBeforeScript3 = consumeSpecialNameChar(
    "P",
    BEFORE_SCRIPT_4
  );
  Tokenizer.prototype._stateBeforeScript4 = consumeSpecialNameChar(
    "T",
    BEFORE_SCRIPT_5
  );
  /**
   * After "<script" in an opening tag. If the name ends here ("/", ">" or
   * whitespace) the tag really is a script tag and the tokenizer records
   * that in `_special`. In every case the character is then re-processed
   * in IN_TAG_NAME.
   */
  Tokenizer.prototype._stateBeforeScript5 = function(c){
    var nameEnded = c === "/" || c === ">" || whitespace(c);
    if(nameEnded){
      this._special = SPECIAL_SCRIPT;
    }

    this._state = IN_TAG_NAME;
    this._index -= 1; //consume the token again
  };
  // After "</sc" while a script tag is open: match the letters R, I, P, T
  // one state at a time. On a mismatch the tokenizer returns to TEXT and
  // re-processes the character there (see ifElseState).
  Tokenizer.prototype._stateAfterScript1 = ifElseState(
    "R",
    AFTER_SCRIPT_2,
    TEXT
  );
  Tokenizer.prototype._stateAfterScript2 = ifElseState(
    "I",
    AFTER_SCRIPT_3,
    TEXT
  );
  Tokenizer.prototype._stateAfterScript3 = ifElseState(
    "P",
    AFTER_SCRIPT_4,
    TEXT
  );
  Tokenizer.prototype._stateAfterScript4 = ifElseState(
    "T",
    AFTER_SCRIPT_5,
    TEXT
  );
  /**
   * After "</script" while a script tag is open.
   *
   * If the name ends here (">" or whitespace) this is the real end tag:
   * special mode is left, the section is rewound to the "s" of "script"
   * (6 characters back) and the character is re-processed in
   * IN_CLOSING_TAG_NAME so the tag is emitted normally.
   *
   * Otherwise the text merely started like an end tag. The tokenizer goes
   * back to TEXT and re-processes the character there, so that a "<" at
   * this position can still open the real end tag ("</script</script>").
   */
  Tokenizer.prototype._stateAfterScript5 = function(c){
    if(c === ">" || whitespace(c)){
      this._special = SPECIAL_NONE;
      this._state = IN_CLOSING_TAG_NAME;
      this._sectionStart = this._index - 6;
      this._index--; //reconsume the token
    } else {
      this._state = TEXT;
      this._index--; //reconsume the token
    }
  };
  // After "<st" in an opening tag: match the letters Y, L, E of "style"
  // one state at a time. A mismatch falls back to an ordinary tag name
  // (see consumeSpecialNameChar).
  Tokenizer.prototype._stateBeforeStyle1 = consumeSpecialNameChar(
    "Y",
    BEFORE_STYLE_2
  );
  Tokenizer.prototype._stateBeforeStyle2 = consumeSpecialNameChar(
    "L",
    BEFORE_STYLE_3
  );
  Tokenizer.prototype._stateBeforeStyle3 = consumeSpecialNameChar(
    "E",
    BEFORE_STYLE_4
  );
  /**
   * After "<style" in an opening tag. If the name ends here ("/", ">" or
   * whitespace) the tag really is a style tag and the tokenizer records
   * that in `_special`. In every case the character is then re-processed
   * in IN_TAG_NAME.
   */
  Tokenizer.prototype._stateBeforeStyle4 = function(c){
    var nameEnded = c === "/" || c === ">" || whitespace(c);
    if(nameEnded){
      this._special = SPECIAL_STYLE;
    }

    this._state = IN_TAG_NAME;
    this._index -= 1; //consume the token again
  };
  // After "</st" while a style tag is open: match the letters Y, L, E one
  // state at a time. On a mismatch the tokenizer returns to TEXT and
  // re-processes the character there (see ifElseState).
  Tokenizer.prototype._stateAfterStyle1 = ifElseState(
    "Y",
    AFTER_STYLE_2,
    TEXT
  );
  Tokenizer.prototype._stateAfterStyle2 = ifElseState(
    "L",
    AFTER_STYLE_3,
    TEXT
  );
  Tokenizer.prototype._stateAfterStyle3 = ifElseState(
    "E",
    AFTER_STYLE_4,
    TEXT
  );
  /**
   * After "</style" while a style tag is open.
   *
   * If the name ends here (">" or whitespace) this is the real end tag:
   * special mode is left, the section is rewound to the "s" of "style"
   * (5 characters back) and the character is re-processed in
   * IN_CLOSING_TAG_NAME so the tag is emitted normally.
   *
   * Otherwise the text merely started like an end tag. The tokenizer goes
   * back to TEXT and re-processes the character there, so that a "<" at
   * this position can still open the real end tag ("</style</style>").
   */
  Tokenizer.prototype._stateAfterStyle4 = function(c){
    if(c === ">" || whitespace(c)){
      this._special = SPECIAL_NONE;
      this._state = IN_CLOSING_TAG_NAME;
      this._sectionStart = this._index - 5;
      this._index--; //reconsume the token
    } else {
      this._state = TEXT;
      this._index--; //reconsume the token
    }
  };
  // After "&": "#" announces a numeric entity. Any other character is
  // re-processed as the first character of a named entity.
  Tokenizer.prototype._stateBeforeEntity = ifElseState(
    "#",
    BEFORE_NUMERIC_ENTITY,
    IN_NAMED_ENTITY
  );

  // After "&#": "x"/"X" announces a hexadecimal entity. Any other character
  // is re-processed as the first character of a decimal entity.
  Tokenizer.prototype._stateBeforeNumericEntity = ifElseState(
    "X",
    IN_HEX_ENTITY,
    IN_NUMERIC_ENTITY
  );
  /**
   * Looks up a named entity that is terminated by the current character.
   *
   * The name is the text between the "&" at `_sectionStart` and `_index`.
   * It is looked up in the XML table in XML mode and in the full table
   * otherwise. On a hit the replacement is emitted and the section restarts
   * behind the current character. An empty or unknown name changes nothing.
   */
  Tokenizer.prototype._parseNamedEntityStrict = function(){
    var nameStart = this._sectionStart + 1; //offset = 1, skips the "&"
    if(nameStart >= this._index) return;

    var name = this._buffer.substring(nameStart, this._index);
    var table = this._xmlMode ? xmlMap : entityMap;
    if(!table.hasOwnProperty(name)) return;

    this._emitPartial(table[name]);
    this._sectionStart = this._index + 1;
  };
  /**
   * Looks for a legacy entity (one that needs no trailing semicolon) at the
   * start of the current entity name.
   *
   * Candidates are tried from the longest (at most 6 characters) down to the
   * shortest (2 characters). On the first hit the replacement is emitted and
   * the section start moves past the "&" and the matched name. Without a hit
   * nothing changes.
   */
  Tokenizer.prototype._parseLegacyEntity = function(){
    var start = this._sectionStart + 1;
    var available = this._index - start;

    //the max length of legacy entities is 6, the min length is 2
    for(var len = available > 6 ? 6 : available; len >= 2; len--){
      var candidate = this._buffer.substr(start, len);
      if(!legacyMap.hasOwnProperty(candidate)) continue;

      this._emitPartial(legacyMap[candidate]);
      this._sectionStart += len + 1;
      return;
    }
  };
  /**
   * Inside a named entity (after "&").
   *
   * ";" ends the entity: a strict lookup is tried first and, outside XML
   * mode, a legacy lookup if the strict one consumed nothing.
   *
   * Any other character that is not an ASCII letter or digit also ends the
   * name, and that character is re-processed in the base state. Outside XML
   * mode, and only if the name is not empty, a lookup still happens: a
   * legacy lookup in text, or a strict lookup in attribute values unless the
   * terminating character is "=".
   *
   * Letters and digits simply extend the name.
   */
  Tokenizer.prototype._stateInNamedEntity = function(c){
    if(c === ";"){
      this._parseNamedEntityStrict();
      if(this._sectionStart + 1 < this._index && !this._xmlMode){
        this._parseLegacyEntity();
      }
      this._state = this._baseState;
      return;
    }

    var endsName = (c < "a" || c > "z") && (c < "A" || c > "Z") && (c < "0" || c > "9");
    if(!endsName) return;

    var nameIsEmpty = this._sectionStart + 1 === this._index;
    if(!this._xmlMode && !nameIsEmpty){
      if(this._baseState === TEXT){
        this._parseLegacyEntity();
      } else if(c !== "="){
        this._parseNamedEntityStrict();
      }
    }

    this._state = this._baseState;
    this._index -= 1;
  };
  /**
   * Decodes the numeric entity that ends at the current character and
   * returns to the base state.
   *
   * @param {number} offset Length of the entity prefix, counted from
   *   `_sectionStart` (callers pass 2 for decimal and 3 for hex entities).
   * @param {number} base Radix of the digits (10 or 16).
   *
   * With at least one digit the decoded character is emitted and the
   * section restarts at the current character. With no digits the section
   * start is moved back by one instead.
   */
  Tokenizer.prototype._decodeNumericEntity = function(offset, base){
    var digitsStart = this._sectionStart + offset;

    if(digitsStart === this._index){
      // no digits at all
      this._sectionStart--;
    } else {
      //parse entity
      var digits = this._buffer.substring(digitsStart, this._index);
      var codePoint = parseInt(digits, base);
      this._emitPartial(decodeCodePoint(codePoint));
      this._sectionStart = this._index;
    }

    this._state = this._baseState;
  };
  /**
   * Inside a decimal entity (after "&#"). ";" ends and decodes the entity
   * and is skipped. Any non-digit also ends it and is re-processed in the
   * base state: outside XML mode the digits read so far are decoded, while
   * in XML mode the unterminated entity is not decoded. Digits are consumed.
   */
  Tokenizer.prototype._stateInNumericEntity = function(c){
    if(c === ";"){
      this._decodeNumericEntity(2, 10);
      this._sectionStart++;
      return;
    }

    if(!(c < "0" || c > "9")) return;

    if(this._xmlMode){
      this._state = this._baseState;
    } else {
      this._decodeNumericEntity(2, 10);
    }
    this._index -= 1;
  };
  /**
   * Inside a hexadecimal entity (after "&#x"). ";" ends and decodes the
   * entity and is skipped. Any non-hex-digit also ends it and is
   * re-processed in the base state: outside XML mode the digits read so far
   * are decoded, while in XML mode the unterminated entity is not decoded.
   * Hex digits are consumed.
   */
  Tokenizer.prototype._stateInHexEntity = function(c){
    if(c === ";"){
      this._decodeNumericEntity(3, 16);
      this._sectionStart++;
      return;
    }

    var endsDigits = (c < "a" || c > "f") && (c < "A" || c > "F") && (c < "0" || c > "9");
    if(!endsDigits) return;

    if(this._xmlMode){
      this._state = this._baseState;
    } else {
      this._decodeNumericEntity(3, 16);
    }
    this._index -= 1;
  };
  /**
   * Trims the buffer after a parsing pass and keeps `_bufferOffset` (the
   * number of characters already removed from the buffer) up to date, so
   * that getAbsoluteIndex() stays correct across chunks.
   *
   * - No open section (`_sectionStart < 0`): the whole buffer is dropped.
   * - Paused (`_running` is false): nothing is touched.
   * - In TEXT: pending text is emitted via `ontext`, then the whole buffer
   *   is dropped.
   * - A section that has just started: the whole buffer is dropped.
   * - Otherwise: everything before the open section is dropped and the
   *   indices are shifted accordingly.
   *
   * Whenever the whole buffer is dropped, `_bufferOffset` must be advanced
   * by `_index` BEFORE `_index` is reset to 0. Doing it the other way round
   * adds 0 and loses the consumed length.
   */
  Tokenizer.prototype._cleanup = function(){
    if(this._sectionStart < 0){
      this._buffer = "";
      this._bufferOffset += this._index;
      this._index = 0;
    } else if(this._running){
      if(this._state === TEXT){
        if(this._sectionStart !== this._index){
          this._cbs.ontext(this._buffer.substr(this._sectionStart));
        }
        this._buffer = "";
        this._bufferOffset += this._index;
        this._index = 0;
      } else if(this._sectionStart === this._index){
        //the section just started
        this._buffer = "";
        this._bufferOffset += this._index;
        this._index = 0;
      } else {
        //remove everything unnecessary
        this._buffer = this._buffer.substr(this._sectionStart);
        this._index -= this._sectionStart;
        this._bufferOffset += this._sectionStart;
      }

      this._sectionStart = 0;
    }
  };
  //TODO make events conditional

  /**
   * Appends `chunk` to the buffer and parses as much as possible.
   * Writing after end() reports an error through `onerror`, but the chunk
   * is still appended and parsed.
   */
  Tokenizer.prototype.write = function(chunk){
    if(this._ended){
      this._cbs.onerror(new Error(".write() after done!"));
    }

    this._buffer = this._buffer + chunk;
    this._parse();
  };
  /**
   * Main loop: feeds the buffer to the handler of the current state, one
   * character at a time, until the buffer is exhausted or the tokenizer is
   * paused. Handlers may step `_index` back to have a character processed
   * again in a new state. An unknown state is reported through `onerror`.
   * The buffer is trimmed by _cleanup() afterwards.
   */
  Tokenizer.prototype._parse = function(){
    while(this._index < this._buffer.length && this._running){
      var c = this._buffer.charAt(this._index);

      switch(this._state){
        case TEXT: this._stateText(c); break;
        case BEFORE_TAG_NAME: this._stateBeforeTagName(c); break;
        case IN_TAG_NAME: this._stateInTagName(c); break;
        case BEFORE_CLOSING_TAG_NAME: this._stateBeforeCloseingTagName(c); break;
        case IN_CLOSING_TAG_NAME: this._stateInCloseingTagName(c); break;
        case AFTER_CLOSING_TAG_NAME: this._stateAfterCloseingTagName(c); break;
        case IN_SELF_CLOSING_TAG: this._stateInSelfClosingTag(c); break;

        /*
         * attributes
         */
        case BEFORE_ATTRIBUTE_NAME: this._stateBeforeAttributeName(c); break;
        case IN_ATTRIBUTE_NAME: this._stateInAttributeName(c); break;
        case AFTER_ATTRIBUTE_NAME: this._stateAfterAttributeName(c); break;
        case BEFORE_ATTRIBUTE_VALUE: this._stateBeforeAttributeValue(c); break;
        case IN_ATTRIBUTE_VALUE_DQ: this._stateInAttributeValueDoubleQuotes(c); break;
        case IN_ATTRIBUTE_VALUE_SQ: this._stateInAttributeValueSingleQuotes(c); break;
        case IN_ATTRIBUTE_VALUE_NQ: this._stateInAttributeValueNoQuotes(c); break;

        /*
         * declarations
         */
        case BEFORE_DECLARATION: this._stateBeforeDeclaration(c); break;
        case IN_DECLARATION: this._stateInDeclaration(c); break;

        /*
         * processing instructions
         */
        case IN_PROCESSING_INSTRUCTION: this._stateInProcessingInstruction(c); break;

        /*
         * comments
         */
        case BEFORE_COMMENT: this._stateBeforeComment(c); break;
        case IN_COMMENT: this._stateInComment(c); break;
        case AFTER_COMMENT_1: this._stateAfterComment1(c); break;
        case AFTER_COMMENT_2: this._stateAfterComment2(c); break;

        /*
         * cdata
         */
        case BEFORE_CDATA_1: this._stateBeforeCdata1(c); break;
        case BEFORE_CDATA_2: this._stateBeforeCdata2(c); break;
        case BEFORE_CDATA_3: this._stateBeforeCdata3(c); break;
        case BEFORE_CDATA_4: this._stateBeforeCdata4(c); break;
        case BEFORE_CDATA_5: this._stateBeforeCdata5(c); break;
        case BEFORE_CDATA_6: this._stateBeforeCdata6(c); break;
        case IN_CDATA: this._stateInCdata(c); break;
        case AFTER_CDATA_1: this._stateAfterCdata1(c); break;
        case AFTER_CDATA_2: this._stateAfterCdata2(c); break;

        /*
         * special tags
         */
        case BEFORE_SPECIAL: this._stateBeforeSpecial(c); break;
        case BEFORE_SPECIAL_END: this._stateBeforeSpecialEnd(c); break;

        /*
         * script
         */
        case BEFORE_SCRIPT_1: this._stateBeforeScript1(c); break;
        case BEFORE_SCRIPT_2: this._stateBeforeScript2(c); break;
        case BEFORE_SCRIPT_3: this._stateBeforeScript3(c); break;
        case BEFORE_SCRIPT_4: this._stateBeforeScript4(c); break;
        case BEFORE_SCRIPT_5: this._stateBeforeScript5(c); break;
        case AFTER_SCRIPT_1: this._stateAfterScript1(c); break;
        case AFTER_SCRIPT_2: this._stateAfterScript2(c); break;
        case AFTER_SCRIPT_3: this._stateAfterScript3(c); break;
        case AFTER_SCRIPT_4: this._stateAfterScript4(c); break;
        case AFTER_SCRIPT_5: this._stateAfterScript5(c); break;

        /*
         * style
         */
        case BEFORE_STYLE_1: this._stateBeforeStyle1(c); break;
        case BEFORE_STYLE_2: this._stateBeforeStyle2(c); break;
        case BEFORE_STYLE_3: this._stateBeforeStyle3(c); break;
        case BEFORE_STYLE_4: this._stateBeforeStyle4(c); break;
        case AFTER_STYLE_1: this._stateAfterStyle1(c); break;
        case AFTER_STYLE_2: this._stateAfterStyle2(c); break;
        case AFTER_STYLE_3: this._stateAfterStyle3(c); break;
        case AFTER_STYLE_4: this._stateAfterStyle4(c); break;

        /*
         * entities
         */
        case BEFORE_ENTITY: this._stateBeforeEntity(c); break;
        case BEFORE_NUMERIC_ENTITY: this._stateBeforeNumericEntity(c); break;
        case IN_NAMED_ENTITY: this._stateInNamedEntity(c); break;
        case IN_NUMERIC_ENTITY: this._stateInNumericEntity(c); break;
        case IN_HEX_ENTITY: this._stateInHexEntity(c); break;

        default:
          this._cbs.onerror(Error("unknown _state"), this._state);
      }

      this._index++;
    }

    this._cleanup();
  };
  /**
   * Pauses tokenizing: clears the `_running` flag that _parse() checks
   * before each character.
   */
  Tokenizer.prototype.pause = function(){
    var running = false;
    this._running = running;
  };
  /**
   * Resumes tokenizing after pause(). Unparsed input left in the buffer is
   * parsed first. If end() was called in the meantime, the tokenizer is
   * then finished.
   */
  Tokenizer.prototype.resume = function(){
    this._running = true;

    var hasPendingInput = this._index < this._buffer.length;
    if(hasPendingInput){
      this._parse();
    }

    if(this._ended){
      this._finish();
    }
  };
  /**
   * Signals the end of the input, optionally writing a final chunk first.
   * Calling end() a second time reports an error through `onerror`. If the
   * tokenizer is not paused it is finished right away; otherwise resume()
   * takes care of that.
   */
  Tokenizer.prototype.end = function(chunk){
    if(this._ended){
      this._cbs.onerror(new Error(".end() after done!"));
    }

    if(chunk){
      this.write(chunk);
    }

    this._ended = true;

    if(this._running){
      this._finish();
    }
  };
  /**
   * Finishes tokenizing: whatever is left in an open section is emitted in
   * a reasonable way, then `onend` is called.
   */
  Tokenizer.prototype._finish = function(){
    var hasTrailingData = this._sectionStart < this._index;
    if(hasTrailingData){
      this._handleTrailingData();
    }

    this._cbs.onend();
  };
  /**
   * Emits the data of the section that is still open when the input ends.
   *
   * - In a CDATA section or a comment, the data is emitted as such.
   * - In an entity (outside XML mode), the entity is decoded as far as
   *   possible. If anything is left afterwards it is handled again as if
   *   the tokenizer were in the base state.
   * - Inside a tag (tag name, attribute or closing tag name states) the
   *   data is dropped.
   * - Everything else is emitted as text.
   */
  Tokenizer.prototype._handleTrailingData = function(){
    var data = this._buffer.substr(this._sectionStart);
    var state = this._state;

    if(state === IN_CDATA || state === AFTER_CDATA_1 || state === AFTER_CDATA_2){
      this._cbs.oncdata(data);
      return;
    }

    if(state === IN_COMMENT || state === AFTER_COMMENT_1 || state === AFTER_COMMENT_2){
      this._cbs.oncomment(data);
      return;
    }

    var inEntity = state === IN_NAMED_ENTITY || state === IN_NUMERIC_ENTITY || state === IN_HEX_ENTITY;
    if(inEntity && !this._xmlMode){
      if(state === IN_NAMED_ENTITY){
        this._parseLegacyEntity();
      } else if(state === IN_NUMERIC_ENTITY){
        this._decodeNumericEntity(2, 10);
      } else {
        this._decodeNumericEntity(3, 16);
      }

      if(this._sectionStart < this._index){
        this._state = this._baseState;
        this._handleTrailingData();
      }
      return;
    }

    switch(state){
      case IN_TAG_NAME:
      case BEFORE_ATTRIBUTE_NAME:
      case BEFORE_ATTRIBUTE_VALUE:
      case AFTER_ATTRIBUTE_NAME:
      case IN_ATTRIBUTE_NAME:
      case IN_ATTRIBUTE_VALUE_SQ:
      case IN_ATTRIBUTE_VALUE_DQ:
      case IN_ATTRIBUTE_VALUE_NQ:
      case IN_CLOSING_TAG_NAME:
        //ignore remaining data
        //TODO add a way to remove current tag
        break;
      default:
        this._cbs.ontext(data);
    }
  };
  /**
   * Puts the tokenizer back into its initial state by re-running the
   * constructor on this instance, keeping the current options and callbacks.
   */
  Tokenizer.prototype.reset = function(){
    var options = {
      xmlMode: this._xmlMode,
      decodeEntities: this._decodeEntities
    };
    Tokenizer.call(this, options, this._cbs);
  };
  /**
   * Returns the current position counted over everything written so far:
   * the number of characters already removed from the buffer plus the index
   * inside the buffer.
   *
   * @returns {number}
   */
  Tokenizer.prototype.getAbsoluteIndex = function(){
    var removed = this._bufferOffset;
    return removed + this._index;
  };
  /**
   * Returns the text of the current section: the part of the buffer from
   * `_sectionStart` up to, but not including, `_index`.
   *
   * @returns {string}
   */
  Tokenizer.prototype._getSection = function(){
    var from = this._sectionStart;
    var to = this._index;
    return this._buffer.substring(from, to);
  };
  /**
   * Passes the current section to the callback called `name` and then marks
   * the section as closed (`_sectionStart = -1`).
   *
   * @param {string} name Name of the callback on `_cbs`.
   */
  Tokenizer.prototype._emitToken = function(name){
    var handlers = this._cbs;
    handlers[name](this._getSection());
    this._sectionStart = -1;
  };
  /**
   * Emits a decoded entity value: as text when the entity was found in
   * text, as attribute data when it was found in an attribute value.
   *
   * @param {string} value Decoded replacement.
   */
  Tokenizer.prototype._emitPartial = function(value){
    if(this._baseState === TEXT){
      this._cbs.ontext(value);
      return;
    }
    this._cbs.onattribdata(value); //TODO implement the new event
  };