xmlentities and xml_entity_decode


/ Published in: PHP
Save to your folder(s)

Because htmlentities just doesn't cut it.

I recently needed this to work with third-party software and ended up having to make it pretty robust, so I thought I'd share!


Copy this code and paste it in your HTML
  1. <?php
  /**
   * unicode_ord
   *
   * Decodes the UTF-8 encoded character that starts at byte offset $i of $c
   * and returns its Unicode code point.
   *
   * When a multi-byte character is decoded, $i is advanced to the LAST byte of
   * that character, so a surrounding `for (...; $i++)` loop continues with the
   * next character. $i is left untouched for single-byte characters and
   * whenever false is returned.
   *
   * @param string $c The source string (taken by reference, never modified)
   * @param integer $i The byte index to get the char from (passed by reference for use in a loop)
   * @return integer|false The code point of the char at $c[$i], or false when
   *                       the offset is past the end of the string or the bytes
   *                       at that offset are not a well-formed UTF-8 sequence
   *                       (stray/missing continuation byte, truncated sequence,
   *                       overlong form, surrogate, or value above U+10FFFF)
   * @author kerry at shetline dot com
   * @author Dom Hastings - modified to suit my needs
   * @see http://www.php.net/manual/en/function.ord.php#78032
   */
  function unicode_ord(&$c, &$i = 0) {
      $length = strlen($c);
      $index = $i;

      // nothing to read at or past the end of the string
      if ($index >= $length) {
          return false;
      }

      $lead = ord($c[$index]);

      // plain ASCII: the byte is the code point
      if ($lead <= 0x7F) {
          return $lead;
      }

      // Classify the lead byte: number of continuation bytes that must follow,
      // the payload bits carried by the lead byte, and the smallest code point
      // that legitimately needs this many bytes (used to reject overlong forms).
      if ($lead >= 0xC2 && $lead <= 0xDF) {
          $extra = 1;
          $value = $lead & 0x1F;
          $minimum = 0x80;

      } elseif ($lead >= 0xE0 && $lead <= 0xEF) {
          $extra = 2;
          $value = $lead & 0x0F;
          $minimum = 0x800;

      } elseif ($lead >= 0xF0 && $lead <= 0xF4) {
          $extra = 3;
          $value = $lead & 0x07;
          $minimum = 0x10000;

      } else {
          // 0x80-0xC1 and 0xF5-0xFF can never start a UTF-8 sequence
          return false;
      }

      // the sequence must fit inside the string
      if ($index + $extra >= $length) {
          return false;
      }

      // every following byte must look like 10xxxxxx
      for ($k = 1; $k <= $extra; $k++) {
          $byte = ord($c[$index + $k]);

          if (($byte & 0xC0) !== 0x80) {
              return false;
          }

          $value = ($value << 6) | ($byte & 0x3F);
      }

      // reject overlong encodings, UTF-16 surrogates and values beyond Unicode
      if ($value < $minimum || $value > 0x10FFFF || ($value >= 0xD800 && $value <= 0xDFFF)) {
          return false;
      }

      // only consume the bytes once the whole sequence is known to be valid
      $i += $extra;

      return $value;
  }
  56.  
  /**
   * unicode_chr
   *
   * Encodes a Unicode code point as a UTF-8 byte string.
   *
   * The argument is taken by value, so literals such as unicode_chr(0x20AC)
   * work as well as variables. Numeric strings (e.g. "8364") are accepted and
   * converted to an integer first.
   *
   * @param integer|string $c The code point to encode
   * @return string|false The UTF-8 encoded character, or false when $c is not
   *                      numeric, is negative, is greater than U+10FFFF, or is
   *                      a UTF-16 surrogate (U+D800 - U+DFFF), none of which
   *                      can be represented in well-formed UTF-8
   * @author Miguel Perez
   * @see http://www.php.net/manual/en/function.chr.php#77911
   */
  function unicode_chr($c) {
      // range-check before casting so oversized floats/strings never reach (int)
      if (!is_numeric($c) || $c < 0 || $c > 0x10FFFF) {
          return false;
      }

      $code = (int) $c;

      // surrogates are reserved for UTF-16 and have no UTF-8 encoding
      if ($code >= 0xD800 && $code <= 0xDFFF) {
          return false;
      }

      // one byte: 0xxxxxxx
      if ($code <= 0x7F) {
          return chr($code);
      }

      // two bytes: 110xxxxx 10xxxxxx
      if ($code <= 0x7FF) {
          return chr(0xC0 | ($code >> 6))
              . chr(0x80 | ($code & 0x3F));
      }

      // three bytes: 1110xxxx 10xxxxxx 10xxxxxx
      if ($code <= 0xFFFF) {
          return chr(0xE0 | ($code >> 12))
              . chr(0x80 | (($code >> 6) & 0x3F))
              . chr(0x80 | ($code & 0x3F));
      }

      // four bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
      return chr(0xF0 | ($code >> 18))
          . chr(0x80 | (($code >> 12) & 0x3F))
          . chr(0x80 | (($code >> 6) & 0x3F))
          . chr(0x80 | ($code & 0x3F));
  }
  82.  
  /**
   * xmlentities
   *
   * Makes the specified string XML-safe.
   *
   * The string is read as UTF-8, one character at a time:
   *  - & < > ' " become the five predefined XML entities;
   *  - tab, line feed, carriage return and printable ASCII (space to ~) are
   *    copied through unchanged;
   *  - every other legal XML character (DEL and everything above ASCII) is
   *    written as a numeric character reference;
   *  - anything that is not a legal XML 1.0 character is dropped. That covers
   *    bytes unicode_ord() cannot decode, NUL and the other C0 controls
   *    (including vertical tab and form feed), U+FFFE and U+FFFF. Such
   *    characters are not allowed in an XML 1.0 document even as references,
   *    so emitting them would make the result ill-formed.
   *
   * @param string $s
   * @param boolean $hex Whether or not to make hexadecimal entities (as opposed to decimal)
   * @return string The XML-safe result (an empty() input is returned unchanged)
   * @author Dom Hastings
   * @dependencies unicode_ord()
   * @see http://www.w3.org/TR/REC-xml/#sec-predefined-ent
   * @see http://www.w3.org/TR/REC-xml/#charsets
   */
  function xmlentities($s, $hex = true) {
      // nothing to escape
      if (empty($s)) {
          return $s;
      }

      // code point => predefined entity
      static $predefined = array(
          38 => '&amp;',
          60 => '&lt;',
          62 => '&gt;',
          39 => '&apos;',
          34 => '&quot;'
      );

      $r = '';
      $l = strlen($s);

      // unicode_ord() moves $i onto the last byte of a multi-byte character,
      // so the $i++ below always lands on the start of the next character
      for ($i = 0; $i < $l; $i++) {
          $o = unicode_ord($s, $i);

          // undecodable byte: skip it
          if ($o === false) {
              continue;
          }

          // XML 1.0 "Char" production
          $legal = ($o === 0x09) || ($o === 0x0A) || ($o === 0x0D)
              || ($o >= 0x20 && $o <= 0xD7FF)
              || ($o >= 0xE000 && $o <= 0xFFFD)
              || ($o >= 0x10000 && $o <= 0x10FFFF);

          if (!$legal) {
              continue;
          }

          if (isset($predefined[$o])) {
              $r .= $predefined[$o];

          // whitespace and printable ASCII need no escaping
          } elseif ($o <= 0x7E) {
              $r .= chr($o);

          // everything else goes out as a character reference
          } elseif ($hex) {
              $r .= '&#x'.strtoupper(dechex($o)).';';

          } else {
              $r .= '&#'.$o.';';
          }
      }

      return $r;
  }
  177.  
  /**
   * xml_entity_decode
   *
   * Converts XML entity encoded data back to a unicode (UTF-8) string.
   *
   * Handles decimal (&#65;) and hexadecimal (&#x41;) character references,
   * the five predefined XML entities, and any extra named entities supplied
   * by the caller. The predefined entities always win over caller-supplied
   * ones with the same name.
   *
   * Only "#" followed by decimal digits, or "#x" followed by hex digits, is
   * treated as a character reference; anything else is looked up by name, so
   * "&65;" is an unknown entity rather than the letter "A".
   *
   * An "&" with no closing ";" is treated as an entity running to the end of
   * the string.
   *
   * @param string $s The XML encoded string
   * @param array $entities Additional entities to decode, name => replacement (optional)
   * @return string The decoded string (an empty() input is returned unchanged)
   * @throws Exception When $entities is not an array, when an entity name is
   *                   not known, or when a character reference names a code
   *                   point unicode_chr() cannot encode
   * @dependencies unicode_chr()
   * @author Dom Hastings
   */
  function xml_entity_decode($s, $entities = array()) {
      // if the string is empty, just return it
      if (empty($s)) {
          return $s;
      }

      // check that entities is an array
      if (!is_array($entities)) {
          throw new Exception('xml_entity_decode expects argument 2 to be array.');
      }

      $r = '';
      $l = strlen($s);

      // defaults are merged last so amp, lt, gt, apos and quot take precedence
      $entities = array_merge($entities, array(
          'amp' => '&',
          'lt' => '<',
          'gt' => '>',
          'apos' => '\'',
          'quot' => '"'
      ));

      for ($i = 0; $i < $l; $i++) {
          // regular character: copy it through
          if ($s[$i] != '&') {
              $r .= $s[$i];
              continue;
          }

          // the entity body is everything up to the next semi-colon,
          // or up to the end of the string when there is none
          $end = strpos($s, ';', $i + 1);

          if ($end === false) {
              $end = $l;
          }

          // the cast guards against substr() returning false on older PHP versions
          $e = (string) substr($s, $i + 1, $end - $i - 1);

          // continue after the semi-colon on the next iteration
          $i = $end;

          // hexadecimal character reference
          if (preg_match('/\A#x([0-9A-Fa-f]+)\z/', $e, $m)) {
              $code = hexdec($m[1]);

          // decimal character reference
          } elseif (preg_match('/\A#([0-9]+)\z/', $e, $m)) {
              $code = (int) $m[1];

          // named entity
          } else {
              $code = null;
          }

          if ($code !== null) {
              $char = unicode_chr($code);

              // the reference does not name an encodable code point
              if ($char === false) {
                  throw new Exception('Invalid character reference "'.$e.'"');
              }

              $r .= $char;

          } elseif (array_key_exists($e, $entities)) {
              $r .= $entities[$e];

          } else {
              // we don't know what to do with this
              throw new Exception('Unknown entity "'.$e.'"');
          }
      }

      return $r;
  }

URL: http://www.dom111.co.uk/blog/coding/xml-entities-in-php/224

Report this snippet


Comments

RSS Icon Subscribe to comments

You need to login to post a comment.