1. /*
  2. * Copyright 2002-2004 the original author or authors.
  3. *
  4. * Licensed under the Apache License, Version 2.0 (the "License");
  5. * you may not use this file except in compliance with the License.
  6. * You may obtain a copy of the License at
  7. *
  8. * http://www.apache.org/licenses/LICENSE-2.0
  9. *
  10. * Unless required by applicable law or agreed to in writing, software
  11. * distributed under the License is distributed on an "AS IS" BASIS,
  12. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. * See the License for the specific language governing permissions and
  14. * limitations under the License.
  15. */
  16. package org.springframework.web.util;
  17. import java.util.HashMap;
  18. import java.util.Map;
  19. /**
  20. * Utility class for HTML escaping. Escapes and unescapes
  21. * based on the W3C HTML 4.01 recommendation.
  22. *
  23. * <p>Reference:
  24. * <a href="http://www.w3.org/TR/html4/charset.html">
  25. * http://www.w3.org/TR/html4/charset.html
  26. * </a>
  27. *
  28. * @author Chris Wilson
  29. * @author Juergen Hoeller
  30. * @since 01.03.2003
  31. */
  32. public abstract class HtmlUtils {
  33. private static final String EMPTY_REFERENCE = "&;";
  34. private static final String REFERENCE_START = "&#";
  35. private static final String MALFORMED_REFERENCE = "&#;";
  36. private static final Map ENTITIES = new HashMap();
  37. static {
  38. ENTITIES.put("nbsp", new Integer(160));
  39. ENTITIES.put("iexcl", new Integer(161));
  40. ENTITIES.put("cent", new Integer(162));
  41. ENTITIES.put("pound", new Integer(163));
  42. ENTITIES.put("curren", new Integer(164));
  43. ENTITIES.put("yen", new Integer(165));
  44. ENTITIES.put("brvbar", new Integer(166));
  45. ENTITIES.put("sect", new Integer(167));
  46. ENTITIES.put("uml", new Integer(168));
  47. ENTITIES.put("copy", new Integer(169));
  48. ENTITIES.put("ordf", new Integer(170));
  49. ENTITIES.put("laquo", new Integer(171));
  50. ENTITIES.put("not", new Integer(172));
  51. ENTITIES.put("shy", new Integer(173));
  52. ENTITIES.put("reg", new Integer(174));
  53. ENTITIES.put("macr", new Integer(175));
  54. ENTITIES.put("deg", new Integer(176));
  55. ENTITIES.put("plusmn", new Integer(177));
  56. ENTITIES.put("sup2", new Integer(178));
  57. ENTITIES.put("sup3", new Integer(179));
  58. ENTITIES.put("acute", new Integer(180));
  59. ENTITIES.put("micro", new Integer(181));
  60. ENTITIES.put("para", new Integer(182));
  61. ENTITIES.put("middot", new Integer(183));
  62. ENTITIES.put("cedil", new Integer(184));
  63. ENTITIES.put("sup1", new Integer(185));
  64. ENTITIES.put("ordm", new Integer(186));
  65. ENTITIES.put("raquo", new Integer(187));
  66. ENTITIES.put("frac14", new Integer(188));
  67. ENTITIES.put("frac12", new Integer(189));
  68. ENTITIES.put("frac34", new Integer(190));
  69. ENTITIES.put("iquest", new Integer(191));
  70. ENTITIES.put("Agrave", new Integer(192));
  71. ENTITIES.put("Aacute", new Integer(193));
  72. ENTITIES.put("Acirc", new Integer(194));
  73. ENTITIES.put("Atilde", new Integer(195));
  74. ENTITIES.put("Auml", new Integer(196));
  75. ENTITIES.put("Aring", new Integer(197));
  76. ENTITIES.put("AElig", new Integer(198));
  77. ENTITIES.put("Ccedil", new Integer(199));
  78. ENTITIES.put("Egrave", new Integer(200));
  79. ENTITIES.put("Eacute", new Integer(201));
  80. ENTITIES.put("Ecirc", new Integer(202));
  81. ENTITIES.put("Euml", new Integer(203));
  82. ENTITIES.put("Igrave", new Integer(204));
  83. ENTITIES.put("Iacute", new Integer(205));
  84. ENTITIES.put("Icirc", new Integer(206));
  85. ENTITIES.put("Iuml", new Integer(207));
  86. ENTITIES.put("ETH", new Integer(208));
  87. ENTITIES.put("Ntilde", new Integer(209));
  88. ENTITIES.put("Ograve", new Integer(210));
  89. ENTITIES.put("Oacute", new Integer(211));
  90. ENTITIES.put("Ocirc", new Integer(212));
  91. ENTITIES.put("Otilde", new Integer(213));
  92. ENTITIES.put("Ouml", new Integer(214));
  93. ENTITIES.put("times", new Integer(215));
  94. ENTITIES.put("Oslash", new Integer(216));
  95. ENTITIES.put("Ugrave", new Integer(217));
  96. ENTITIES.put("Uacute", new Integer(218));
  97. ENTITIES.put("Ucirc", new Integer(219));
  98. ENTITIES.put("Uuml", new Integer(220));
  99. ENTITIES.put("Yacute", new Integer(221));
  100. ENTITIES.put("THORN", new Integer(222));
  101. ENTITIES.put("szlig", new Integer(223));
  102. ENTITIES.put("agrave", new Integer(224));
  103. ENTITIES.put("aacute", new Integer(225));
  104. ENTITIES.put("acirc", new Integer(226));
  105. ENTITIES.put("atilde", new Integer(227));
  106. ENTITIES.put("auml", new Integer(228));
  107. ENTITIES.put("aring", new Integer(229));
  108. ENTITIES.put("aelig", new Integer(230));
  109. ENTITIES.put("ccedil", new Integer(231));
  110. ENTITIES.put("egrave", new Integer(232));
  111. ENTITIES.put("eacute", new Integer(233));
  112. ENTITIES.put("ecirc", new Integer(234));
  113. ENTITIES.put("euml", new Integer(235));
  114. ENTITIES.put("igrave", new Integer(236));
  115. ENTITIES.put("iacute", new Integer(237));
  116. ENTITIES.put("icirc", new Integer(238));
  117. ENTITIES.put("iuml", new Integer(239));
  118. ENTITIES.put("eth", new Integer(240));
  119. ENTITIES.put("ntilde", new Integer(241));
  120. ENTITIES.put("ograve", new Integer(242));
  121. ENTITIES.put("oacute", new Integer(243));
  122. ENTITIES.put("ocirc", new Integer(244));
  123. ENTITIES.put("otilde", new Integer(245));
  124. ENTITIES.put("ouml", new Integer(246));
  125. ENTITIES.put("divide", new Integer(247));
  126. ENTITIES.put("oslash", new Integer(248));
  127. ENTITIES.put("ugrave", new Integer(249));
  128. ENTITIES.put("uacute", new Integer(250));
  129. ENTITIES.put("ucirc", new Integer(251));
  130. ENTITIES.put("uuml", new Integer(252));
  131. ENTITIES.put("yacute", new Integer(253));
  132. ENTITIES.put("thorn", new Integer(254));
  133. ENTITIES.put("yuml", new Integer(255));
  134. ENTITIES.put("fnof", new Integer(402));
  135. ENTITIES.put("Alpha", new Integer(913));
  136. ENTITIES.put("Beta", new Integer(914));
  137. ENTITIES.put("Gamma", new Integer(915));
  138. ENTITIES.put("Delta", new Integer(916));
  139. ENTITIES.put("Epsilon", new Integer(917));
  140. ENTITIES.put("Zeta", new Integer(918));
  141. ENTITIES.put("Eta", new Integer(919));
  142. ENTITIES.put("Theta", new Integer(920));
  143. ENTITIES.put("Iota", new Integer(921));
  144. ENTITIES.put("Kappa", new Integer(922));
  145. ENTITIES.put("Lambda", new Integer(923));
  146. ENTITIES.put("Mu", new Integer(924));
  147. ENTITIES.put("Nu", new Integer(925));
  148. ENTITIES.put("Xi", new Integer(926));
  149. ENTITIES.put("Omicron", new Integer(927));
  150. ENTITIES.put("Pi", new Integer(928));
  151. ENTITIES.put("Rho", new Integer(929));
  152. ENTITIES.put("Sigma", new Integer(931));
  153. ENTITIES.put("Tau", new Integer(932));
  154. ENTITIES.put("Upsilon", new Integer(933));
  155. ENTITIES.put("Phi", new Integer(934));
  156. ENTITIES.put("Chi", new Integer(935));
  157. ENTITIES.put("Psi", new Integer(936));
  158. ENTITIES.put("Omega", new Integer(937));
  159. ENTITIES.put("alpha", new Integer(945));
  160. ENTITIES.put("beta", new Integer(946));
  161. ENTITIES.put("gamma", new Integer(947));
  162. ENTITIES.put("delta", new Integer(948));
  163. ENTITIES.put("epsilon", new Integer(949));
  164. ENTITIES.put("zeta", new Integer(950));
  165. ENTITIES.put("eta", new Integer(951));
  166. ENTITIES.put("theta", new Integer(952));
  167. ENTITIES.put("iota", new Integer(953));
  168. ENTITIES.put("kappa", new Integer(954));
  169. ENTITIES.put("lambda", new Integer(955));
  170. ENTITIES.put("mu", new Integer(956));
  171. ENTITIES.put("nu", new Integer(957));
  172. ENTITIES.put("xi", new Integer(958));
  173. ENTITIES.put("omicron", new Integer(959));
  174. ENTITIES.put("pi", new Integer(960));
  175. ENTITIES.put("rho", new Integer(961));
  176. ENTITIES.put("sigmaf", new Integer(962));
  177. ENTITIES.put("sigma", new Integer(963));
  178. ENTITIES.put("tau", new Integer(964));
  179. ENTITIES.put("upsilon", new Integer(965));
  180. ENTITIES.put("phi", new Integer(966));
  181. ENTITIES.put("chi", new Integer(967));
  182. ENTITIES.put("psi", new Integer(968));
  183. ENTITIES.put("omega", new Integer(969));
  184. ENTITIES.put("thetasym", new Integer(977));
  185. ENTITIES.put("upsih", new Integer(978));
  186. ENTITIES.put("piv", new Integer(982));
  187. ENTITIES.put("bull", new Integer(8226));
  188. ENTITIES.put("hellip", new Integer(8230));
  189. ENTITIES.put("prime", new Integer(8242));
  190. ENTITIES.put("Prime", new Integer(8243));
  191. ENTITIES.put("oline", new Integer(8254));
  192. ENTITIES.put("frasl", new Integer(8260));
  193. ENTITIES.put("weierp", new Integer(8472));
  194. ENTITIES.put("image", new Integer(8465));
  195. ENTITIES.put("real", new Integer(8476));
  196. ENTITIES.put("trade", new Integer(8482));
  197. ENTITIES.put("alefsym", new Integer(8501));
  198. ENTITIES.put("larr", new Integer(8592));
  199. ENTITIES.put("uarr", new Integer(8593));
  200. ENTITIES.put("rarr", new Integer(8594));
  201. ENTITIES.put("darr", new Integer(8595));
  202. ENTITIES.put("harr", new Integer(8596));
  203. ENTITIES.put("crarr", new Integer(8629));
  204. ENTITIES.put("lArr", new Integer(8656));
  205. ENTITIES.put("uArr", new Integer(8657));
  206. ENTITIES.put("rArr", new Integer(8658));
  207. ENTITIES.put("dArr", new Integer(8659));
  208. ENTITIES.put("hArr", new Integer(8660));
  209. ENTITIES.put("forall", new Integer(8704));
  210. ENTITIES.put("part", new Integer(8706));
  211. ENTITIES.put("exist", new Integer(8707));
  212. ENTITIES.put("empty", new Integer(8709));
  213. ENTITIES.put("nabla", new Integer(8711));
  214. ENTITIES.put("isin", new Integer(8712));
  215. ENTITIES.put("notin", new Integer(8713));
  216. ENTITIES.put("ni", new Integer(8715));
  217. ENTITIES.put("prod", new Integer(8719));
  218. ENTITIES.put("sum", new Integer(8721));
  219. ENTITIES.put("minus", new Integer(8722));
  220. ENTITIES.put("lowast", new Integer(8727));
  221. ENTITIES.put("radic", new Integer(8730));
  222. ENTITIES.put("prop", new Integer(8733));
  223. ENTITIES.put("infin", new Integer(8734));
  224. ENTITIES.put("ang", new Integer(8736));
  225. ENTITIES.put("and", new Integer(8743));
  226. ENTITIES.put("or", new Integer(8744));
  227. ENTITIES.put("cap", new Integer(8745));
  228. ENTITIES.put("cup", new Integer(8746));
  229. ENTITIES.put("int", new Integer(8747));
  230. ENTITIES.put("there4", new Integer(8756));
  231. ENTITIES.put("sim", new Integer(8764));
  232. ENTITIES.put("cong", new Integer(8773));
  233. ENTITIES.put("asymp", new Integer(8776));
  234. ENTITIES.put("ne", new Integer(8800));
  235. ENTITIES.put("equiv", new Integer(8801));
  236. ENTITIES.put("le", new Integer(8804));
  237. ENTITIES.put("ge", new Integer(8805));
  238. ENTITIES.put("sub", new Integer(8834));
  239. ENTITIES.put("sup", new Integer(8835));
  240. ENTITIES.put("nsub", new Integer(8836));
  241. ENTITIES.put("sube", new Integer(8838));
  242. ENTITIES.put("supe", new Integer(8839));
  243. ENTITIES.put("oplus", new Integer(8853));
  244. ENTITIES.put("otimes", new Integer(8855));
  245. ENTITIES.put("perp", new Integer(8869));
  246. ENTITIES.put("sdot", new Integer(8901));
  247. ENTITIES.put("lceil", new Integer(8968));
  248. ENTITIES.put("rceil", new Integer(8969));
  249. ENTITIES.put("lfloor", new Integer(8970));
  250. ENTITIES.put("rfloor", new Integer(8971));
  251. ENTITIES.put("lang", new Integer(9001));
  252. ENTITIES.put("rang", new Integer(9002));
  253. ENTITIES.put("loz", new Integer(9674));
  254. ENTITIES.put("spades", new Integer(9824));
  255. ENTITIES.put("clubs", new Integer(9827));
  256. ENTITIES.put("hearts", new Integer(9829));
  257. ENTITIES.put("diams", new Integer(9830));
  258. ENTITIES.put("quot", new Integer(34));
  259. ENTITIES.put("amp", new Integer(38));
  260. ENTITIES.put("lt", new Integer(60));
  261. ENTITIES.put("gt", new Integer(62));
  262. ENTITIES.put("OElig", new Integer(338));
  263. ENTITIES.put("oelig", new Integer(339));
  264. ENTITIES.put("Scaron", new Integer(352));
  265. ENTITIES.put("scaron", new Integer(353));
  266. ENTITIES.put("Yuml", new Integer(376));
  267. ENTITIES.put("circ", new Integer(710));
  268. ENTITIES.put("tilde", new Integer(732));
  269. ENTITIES.put("ensp", new Integer(8194));
  270. ENTITIES.put("emsp", new Integer(8195));
  271. ENTITIES.put("thinsp", new Integer(8201));
  272. ENTITIES.put("zwnj", new Integer(8204));
  273. ENTITIES.put("zwj", new Integer(8205));
  274. ENTITIES.put("lrm", new Integer(8206));
  275. ENTITIES.put("rlm", new Integer(8207));
  276. ENTITIES.put("ndash", new Integer(8211));
  277. ENTITIES.put("mdash", new Integer(8212));
  278. ENTITIES.put("lsquo", new Integer(8216));
  279. ENTITIES.put("rsquo", new Integer(8217));
  280. ENTITIES.put("sbquo", new Integer(8218));
  281. ENTITIES.put("ldquo", new Integer(8220));
  282. ENTITIES.put("rdquo", new Integer(8221));
  283. ENTITIES.put("bdquo", new Integer(8222));
  284. ENTITIES.put("dagger", new Integer(8224));
  285. ENTITIES.put("Dagger", new Integer(8225));
  286. ENTITIES.put("permil", new Integer(8240));
  287. ENTITIES.put("lsaquo", new Integer(8249));
  288. ENTITIES.put("rsaquo", new Integer(8250));
  289. ENTITIES.put("euro", new Integer(8364));
  290. }
  291. /**
  292. * Turn special characters into HTML character references.
  293. * Handles complete character set defined in HTML 4.01 recommendation.
  294. * <p>Escapes all special characters to their corresponding numerial reference
  295. * in the decimal format: &#<i>Decimal</i>
  296. * <p>Reference:
  297. * <a href="http://www.w3.org/TR/html4/sgml/entities.html">
  298. * http://www.w3.org/TR/html4/sgml/entities.html
  299. * </a>
  300. */
  301. public static String htmlEscape(String s) {
  302. if (s == null) {
  303. return null;
  304. }
  305. StringBuffer escaped = new StringBuffer(s.length());
  306. for (int i = 0; i < s.length(); i++) {
  307. char c = s.charAt(i);
  308. // handle non special ASCII chars first since they will be most common
  309. if ((c >= 0 && c <= 33)
  310. || (c >= 35 && c <= 37)
  311. || (c >= 39 && c <= 59)
  312. || (c == 61)
  313. || (c >= 63 && c <= 159)) {
  314. escaped.append(c);
  315. continue;
  316. }
  317. // handle special chars
  318. if (c == 34) {
  319. writeDecimalReference(c, escaped);
  320. continue;
  321. }
  322. if (c == 38) {
  323. writeDecimalReference(c, escaped);
  324. continue;
  325. }
  326. if (c == 60) {
  327. writeDecimalReference(c, escaped);
  328. continue;
  329. }
  330. if (c == 62) {
  331. writeDecimalReference(c, escaped);
  332. continue;
  333. }
  334. if (c >= 160 && c <= 255) {
  335. writeDecimalReference(c, escaped);
  336. continue;
  337. }
  338. if (c >= 338 && c <= 339) {
  339. writeDecimalReference(c, escaped);
  340. continue;
  341. }
  342. if (c >= 352 && c <= 353) {
  343. writeDecimalReference(c, escaped);
  344. continue;
  345. }
  346. if (c == 376) {
  347. writeDecimalReference(c, escaped);
  348. continue;
  349. }
  350. if (c == 402) {
  351. writeDecimalReference(c, escaped);
  352. continue;
  353. }
  354. if (c == 710) {
  355. writeDecimalReference(c, escaped);
  356. continue;
  357. }
  358. if (c == 732) {
  359. writeDecimalReference(c, escaped);
  360. continue;
  361. }
  362. if (c >= 913 && c <= 929) {
  363. writeDecimalReference(c, escaped);
  364. continue;
  365. }
  366. if (c >= 931 && c <= 937) {
  367. writeDecimalReference(c, escaped);
  368. continue;
  369. }
  370. if (c >= 945 && c <= 969) {
  371. writeDecimalReference(c, escaped);
  372. continue;
  373. }
  374. if (c >= 977 && c <= 978) {
  375. writeDecimalReference(c, escaped);
  376. continue;
  377. }
  378. if (c == 982) {
  379. writeDecimalReference(c, escaped);
  380. continue;
  381. }
  382. if (c >= 8194 && c <= 8195) {
  383. writeDecimalReference(c, escaped);
  384. continue;
  385. }
  386. if (c == 8201) {
  387. writeDecimalReference(c, escaped);
  388. continue;
  389. }
  390. if (c >= 8204 && c <= 8207) {
  391. writeDecimalReference(c, escaped);
  392. continue;
  393. }
  394. if (c >= 8211 && c <= 8212) {
  395. writeDecimalReference(c, escaped);
  396. continue;
  397. }
  398. if (c >= 8216 && c <= 8218) {
  399. writeDecimalReference(c, escaped);
  400. continue;
  401. }
  402. if (c >= 8220 && c <= 8222) {
  403. writeDecimalReference(c, escaped);
  404. continue;
  405. }
  406. if (c >= 8224 && c <= 8226) {
  407. writeDecimalReference(c, escaped);
  408. continue;
  409. }
  410. if (c == 8230) {
  411. writeDecimalReference(c, escaped);
  412. continue;
  413. }
  414. if (c == 8240) {
  415. writeDecimalReference(c, escaped);
  416. continue;
  417. }
  418. if (c >= 8242 && c <= 8243) {
  419. writeDecimalReference(c, escaped);
  420. continue;
  421. }
  422. if (c >= 8249 && c <= 8250) {
  423. writeDecimalReference(c, escaped);
  424. continue;
  425. }
  426. if (c == 8254) {
  427. writeDecimalReference(c, escaped);
  428. continue;
  429. }
  430. if (c == 8260) {
  431. writeDecimalReference(c, escaped);
  432. continue;
  433. }
  434. if (c == 8364) {
  435. writeDecimalReference(c, escaped);
  436. continue;
  437. }
  438. if (c == 8465) {
  439. writeDecimalReference(c, escaped);
  440. continue;
  441. }
  442. if (c == 8472) {
  443. writeDecimalReference(c, escaped);
  444. continue;
  445. }
  446. if (c == 8476) {
  447. writeDecimalReference(c, escaped);
  448. continue;
  449. }
  450. if (c == 8482) {
  451. writeDecimalReference(c, escaped);
  452. continue;
  453. }
  454. if (c == 8501) {
  455. writeDecimalReference(c, escaped);
  456. continue;
  457. }
  458. if (c >= 8592 && c <= 8596) {
  459. writeDecimalReference(c, escaped);
  460. continue;
  461. }
  462. if (c == 8629) {
  463. writeDecimalReference(c, escaped);
  464. continue;
  465. }
  466. if (c >= 8656 && c <= 8660) {
  467. writeDecimalReference(c, escaped);
  468. continue;
  469. }
  470. if (c == 8704) {
  471. writeDecimalReference(c, escaped);
  472. continue;
  473. }
  474. if (c >= 8706 && c <= 8707) {
  475. writeDecimalReference(c, escaped);
  476. continue;
  477. }
  478. if (c == 8709) {
  479. writeDecimalReference(c, escaped);
  480. continue;
  481. }
  482. if (c >= 8711 && c <= 8713) {
  483. writeDecimalReference(c, escaped);
  484. continue;
  485. }
  486. if (c == 8715) {
  487. writeDecimalReference(c, escaped);
  488. continue;
  489. }
  490. if (c == 8719) {
  491. writeDecimalReference(c, escaped);
  492. continue;
  493. }
  494. if (c >= 8721 && c <= 8722) {
  495. writeDecimalReference(c, escaped);
  496. continue;
  497. }
  498. if (c == 8727) {
  499. writeDecimalReference(c, escaped);
  500. continue;
  501. }
  502. if (c == 8730) {
  503. writeDecimalReference(c, escaped);
  504. continue;
  505. }
  506. if (c >= 8733 && c <= 8734) {
  507. writeDecimalReference(c, escaped);
  508. continue;
  509. }
  510. if (c == 8736) {
  511. writeDecimalReference(c, escaped);
  512. continue;
  513. }
  514. if (c >= 8743 && c <= 8747) {
  515. writeDecimalReference(c, escaped);
  516. continue;
  517. }
  518. if (c == 8756) {
  519. writeDecimalReference(c, escaped);
  520. continue;
  521. }
  522. if (c == 8764) {
  523. writeDecimalReference(c, escaped);
  524. continue;
  525. }
  526. if (c == 8773) {
  527. writeDecimalReference(c, escaped);
  528. continue;
  529. }
  530. if (c == 8776) {
  531. writeDecimalReference(c, escaped);
  532. continue;
  533. }
  534. if (c >= 8800 && c <= 8801) {
  535. writeDecimalReference(c, escaped);
  536. continue;
  537. }
  538. if (c >= 8804 && c <= 8805) {
  539. writeDecimalReference(c, escaped);
  540. continue;
  541. }
  542. if (c >= 8834 && c <= 8836) {
  543. writeDecimalReference(c, escaped);
  544. continue;
  545. }
  546. if (c >= 8838 && c <= 8839) {
  547. writeDecimalReference(c, escaped);
  548. continue;
  549. }
  550. if (c == 8853) {
  551. writeDecimalReference(c, escaped);
  552. continue;
  553. }
  554. if (c == 8855) {
  555. writeDecimalReference(c, escaped);
  556. continue;
  557. }
  558. if (c == 8869) {
  559. writeDecimalReference(c, escaped);
  560. continue;
  561. }
  562. if (c == 8901) {
  563. writeDecimalReference(c, escaped);
  564. continue;
  565. }
  566. if (c >= 8968 && c <= 8971) {
  567. writeDecimalReference(c, escaped);
  568. continue;
  569. }
  570. if (c >= 9001 && c <= 9002) {
  571. writeDecimalReference(c, escaped);
  572. continue;
  573. }
  574. if (c == 9674) {
  575. writeDecimalReference(c, escaped);
  576. continue;
  577. }
  578. if (c == 9824) {
  579. writeDecimalReference(c, escaped);
  580. continue;
  581. }
  582. if (c == 9827) {
  583. writeDecimalReference(c, escaped);
  584. continue;
  585. }
  586. // all other chars
  587. escaped.append(c);
  588. }
  589. return escaped.toString();
  590. }
  591. /**
  592. * Turn HTML character references into their plain text UNICODE equivalent.
  593. * <p>Handles complete character set defined in HTML 4.01 recommendation
  594. * and all reference types (decimal, hex, and entity).
  595. * <p>Correctly converts the following formats:
  596. * <blockquote>
  597. * &#<i>Decimal</i> - <i>(Example: &#68;)</i><br>
  598. * &#x<i>Hex</i> - <i>(Example: &#xE5;) case insensitive</i><br>
  599. * &#<i>Entity</i> - <i>(Example: &amp;) case sensitive</i>
  600. * </blockquote>
  601. * Gracefully handles malformed character references by copying original
  602. * characters as is when encountered.<p>
  603. * <p>Reference:
  604. * <a href="http://www.w3.org/TR/html4/sgml/entities.html">
  605. * http://www.w3.org/TR/html4/sgml/entities.html
  606. * </a>
  607. */
  608. public static String htmlUnescape(String s) {
  609. if (s == null) {
  610. return null;
  611. }
  612. StringBuffer unescaped = new StringBuffer(s.length());
  613. for (int i = 0; i < s.length(); i++) {
  614. char c = s.charAt(i);
  615. if (c == '&') {
  616. // don't look more than 12 chars ahead as reference like strings
  617. // should not be longer than 12 chars in length (including ';')
  618. // prevents the entire string from being searched when an '&'
  619. // with no following ';' is an encountered
  620. int start = Math.min(i + 1, s.length() - 1);
  621. int end = Math.min(s.length(), start + 12);
  622. String reference = s.substring(start, end);
  623. int semi = reference.indexOf(';');
  624. if (semi == -1) {
  625. unescaped.append(c);
  626. continue;
  627. }
  628. reference = reference.substring(0, semi);
  629. i = start + semi;
  630. // try entity reference first
  631. Integer iso = (Integer) ENTITIES.get(reference);
  632. if (iso != null) {
  633. unescaped.append((char) iso.intValue());
  634. continue;
  635. }
  636. if (reference.length() == 0) {
  637. unescaped.append(EMPTY_REFERENCE);
  638. continue;
  639. }
  640. if (reference.charAt(0) == '#') {
  641. if (reference.length() > 2) {
  642. int index = 1;
  643. if (reference.charAt(1) == 'x' || reference.charAt(1) == 'X') {
  644. index = 2;
  645. }
  646. try {
  647. unescaped.append(
  648. (char) Integer.parseInt(
  649. reference.substring(index),
  650. (index == 1) ? 10 : 16));
  651. continue;
  652. }
  653. catch (NumberFormatException e) {
  654. // wasn't hex or decimal, copy original chars
  655. unescaped.append('&' + reference + ';');
  656. continue;
  657. }
  658. }
  659. unescaped.append(MALFORMED_REFERENCE);
  660. continue;
  661. }
  662. // may not be valid reference, forget it
  663. i = start - 1;
  664. }
  665. unescaped.append(c);
  666. }
  667. return unescaped.toString();
  668. }
  669. /**
  670. * Write the given character as decimal reference.
  671. * @param c the character to write
  672. * @param buf the buffer to write into
  673. */
  674. private static void writeDecimalReference(char c, StringBuffer buf) {
  675. buf.append(REFERENCE_START);
  676. buf.append((int) c);
  677. buf.append(';');
  678. }
  679. }