Escaper.php 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391
  1. <?php
  2. /**
  3. * @see https://github.com/laminas/laminas-escaper for the canonical source repository
  4. * @copyright https://github.com/laminas/laminas-escaper/blob/master/COPYRIGHT.md
  5. * @license https://github.com/laminas/laminas-escaper/blob/master/LICENSE.md New BSD License
  6. */
  7. namespace Laminas\Escaper;
  /**
   * Context specific methods for use in secure output escaping.
   *
   * Each public escape*() method targets exactly one output context (HTML body,
   * HTML attribute, Javascript, CSS, URL component). Strings are expected in the
   * encoding given to the constructor; the attribute, Javascript and CSS escapers
   * convert to UTF-8 internally and convert the result back afterwards.
   */
  class Escaper
  {
      /**
       * Entity Map mapping Unicode codepoints to any available named HTML entities.
       *
       * While HTML supports far more named entities, the lowest common denominator
       * has become HTML5's XML Serialisation which is restricted to those named
       * entities that XML supports. Using HTML entities would result in this error:
       *     XML Parsing Error: undefined entity
       *
       * @var array
       */
      protected static $htmlNamedEntityMap = [
          34 => 'quot', // quotation mark
          38 => 'amp',  // ampersand
          60 => 'lt',   // less-than sign
          62 => 'gt',   // greater-than sign
      ];

      /**
       * Current encoding for escaping. If not UTF-8, we convert strings from this encoding
       * pre-escaping and back to this encoding post-escaping.
       *
       * @var string
       */
      protected $encoding = 'utf-8';

      /**
       * Holds the value of the special flags passed as second parameter to
       * htmlspecialchars().
       *
       * @var int
       */
      protected $htmlSpecialCharsFlags;

      /**
       * Static Matcher which escapes characters for HTML Attribute contexts
       *
       * @var callable
       */
      protected $htmlAttrMatcher;

      /**
       * Static Matcher which escapes characters for Javascript contexts
       *
       * @var callable
       */
      protected $jsMatcher;

      /**
       * Static Matcher which escapes characters for CSS Attribute contexts
       *
       * @var callable
       */
      protected $cssMatcher;

      /**
       * List of all encodings supported by this class (lower-case names).
       *
       * @var array
       */
      protected $supportedEncodings = [
          'iso-8859-1',   'iso8859-1',    'iso-8859-5',   'iso8859-5',
          'iso-8859-15',  'iso8859-15',   'utf-8',        'cp866',
          'ibm866',       '866',          'cp1251',       'windows-1251',
          'win-1251',     '1251',         'cp1252',       'windows-1252',
          '1252',         'koi8-r',       'koi8-ru',      'koi8r',
          'big5',         '950',          'gb2312',       '936',
          'big5-hkscs',   'shift_jis',    'sjis',         'sjis-win',
          'cp932',        '932',          'euc-jp',       'eucjp',
          'eucjp-win',    'macroman',
      ];

      /**
       * Constructor: Single parameter allows setting of global encoding for use by
       * the current object.
       *
       * The encoding name is lower-cased before it is validated and stored. When
       * null is given the default ('utf-8') is kept.
       *
       * @param string|null $encoding
       * @throws Exception\InvalidArgumentException If $encoding is not a string, is
       *     blank, or is not one of the supported encodings.
       */
      public function __construct($encoding = null)
      {
          if ($encoding !== null) {
              if (! is_string($encoding)) {
                  throw new Exception\InvalidArgumentException(
                      get_class($this) . ' constructor parameter must be a string, received ' . gettype($encoding)
                  );
              }

              if ($encoding === '') {
                  throw new Exception\InvalidArgumentException(
                      get_class($this) . ' constructor parameter does not allow a blank value'
                  );
              }

              $encoding = strtolower($encoding);
              // Strict comparison: both sides are known to be strings at this point.
              if (! in_array($encoding, $this->supportedEncodings, true)) {
                  throw new Exception\InvalidArgumentException(
                      'Value of \'' . $encoding . '\' passed to ' . get_class($this)
                      . ' constructor parameter is invalid. Provide an encoding supported by htmlspecialchars()'
                  );
              }

              $this->encoding = $encoding;
          }

          // We take advantage of ENT_SUBSTITUTE flag to correctly deal with invalid UTF-8 sequences.
          $this->htmlSpecialCharsFlags = ENT_QUOTES | ENT_SUBSTITUTE;

          // set matcher callbacks
          $this->htmlAttrMatcher = [$this, 'htmlAttrMatcher'];
          $this->jsMatcher       = [$this, 'jsMatcher'];
          $this->cssMatcher      = [$this, 'cssMatcher'];
      }

      /**
       * Return the encoding that all output/input is expected to be encoded in.
       *
       * @return string
       */
      public function getEncoding()
      {
          return $this->encoding;
      }

      /**
       * Escape a string for the HTML Body context where there are very few characters
       * of special meaning. Internally this will use htmlspecialchars().
       *
       * @param string $string
       * @return string
       */
      public function escapeHtml($string)
      {
          return htmlspecialchars($string, $this->htmlSpecialCharsFlags, $this->encoding);
      }

      /**
       * Escape a string for the HTML Attribute context. We use an extended set of characters
       * to escape that are not covered by htmlspecialchars() to cover cases where an attribute
       * might be unquoted or quoted illegally (e.g. backticks are valid quotes for IE).
       *
       * Empty strings and strings made only of digits are returned unchanged.
       *
       * @param string $string
       * @throws Exception\RuntimeException If the input is not valid in the configured encoding.
       * @return string
       */
      public function escapeHtmlAttr($string)
      {
          $string = $this->toUtf8($string);
          if ($string === '' || ctype_digit($string)) {
              return $string;
          }

          // Everything except ASCII alphanumerics and , . - _ goes through the matcher.
          $result = preg_replace_callback('/[^a-z0-9,\.\-_]/iSu', $this->htmlAttrMatcher, $string);
          return $this->fromUtf8($result);
      }

      /**
       * Escape a string for the Javascript context. This does not use json_encode(). An extended
       * set of characters are escaped beyond ECMAScript's rules for Javascript literal string
       * escaping in order to prevent misinterpretation of Javascript as HTML leading to the
       * injection of special characters and entities. The escaping used should be tolerant
       * of cases where HTML escaping was not applied on top of Javascript escaping correctly.
       * Backslash escaping is not used as it still leaves the escaped character as-is and so
       * is not useful in a HTML context.
       *
       * Empty strings and strings made only of digits are returned unchanged.
       *
       * @param string $string
       * @throws Exception\RuntimeException If the input is not valid in the configured encoding.
       * @return string
       */
      public function escapeJs($string)
      {
          $string = $this->toUtf8($string);
          if ($string === '' || ctype_digit($string)) {
              return $string;
          }

          // Everything except ASCII alphanumerics and , . _ goes through the matcher.
          $result = preg_replace_callback('/[^a-z0-9,\._]/iSu', $this->jsMatcher, $string);
          return $this->fromUtf8($result);
      }

      /**
       * Escape a string for the URI or Parameter contexts. This should not be used to escape
       * an entire URI - only a subcomponent being inserted. The function is a simple proxy
       * to rawurlencode() which now implements RFC 3986 since PHP 5.3 completely.
       *
       * @param string $string
       * @return string
       */
      public function escapeUrl($string)
      {
          return rawurlencode($string);
      }

      /**
       * Escape a string for the CSS context. CSS escaping can be applied to any string being
       * inserted into CSS and escapes everything except alphanumerics.
       *
       * Empty strings and strings made only of digits are returned unchanged.
       *
       * @param string $string
       * @throws Exception\RuntimeException If the input is not valid in the configured encoding.
       * @return string
       */
      public function escapeCss($string)
      {
          $string = $this->toUtf8($string);
          if ($string === '' || ctype_digit($string)) {
              return $string;
          }

          $result = preg_replace_callback('/[^a-z0-9]/iSu', $this->cssMatcher, $string);
          return $this->fromUtf8($result);
      }

      /**
       * Callback function for preg_replace_callback that applies HTML Attribute
       * escaping to all matches.
       *
       * @param array $matches Match data; element 0 is the single UTF-8 character to escape.
       * @return string The named or hexadecimal entity replacing the character.
       */
      protected function htmlAttrMatcher($matches)
      {
          $chr = $matches[0];

          /**
           * Resolve the full Unicode code point before any range check. Multi-byte
           * characters are converted to UTF-32BE so that their hex dump is the code
           * point itself. Checking only the first UTF-8 byte would never see values
           * in 0x80-0x9F, because the lead byte of those characters is 0xC2.
           */
          if (strlen($chr) > 1) {
              $chr = $this->convertEncoding($chr, 'UTF-32BE', 'UTF-8');
          }
          $ord = (int) hexdec(bin2hex($chr));

          /**
           * The following replaces characters undefined in HTML with the
           * hex entity for the Unicode replacement character: C0 controls
           * other than tab, line feed and carriage return, DEL, and the C1
           * controls U+0080 through U+009F.
           */
          if (
              ($ord <= 0x1f && $ord !== 0x09 && $ord !== 0x0a && $ord !== 0x0d)
              || ($ord >= 0x7f && $ord <= 0x9f)
          ) {
              return '&#xFFFD;';
          }

          /**
           * Check if the current character to escape has a name entity we should
           * replace it with.
           */
          if (isset(static::$htmlNamedEntityMap[$ord])) {
              return '&' . static::$htmlNamedEntityMap[$ord] . ';';
          }

          /**
           * Per OWASP recommendations, we'll use upper hex entities
           * for any other characters where a named entity does not exist.
           */
          if ($ord > 255) {
              return sprintf('&#x%04X;', $ord);
          }
          return sprintf('&#x%02X;', $ord);
      }

      /**
       * Callback function for preg_replace_callback that applies Javascript
       * escaping to all matches.
       *
       * Single-byte characters become \xHH; multi-byte characters become one
       * \uHHHH escape, or two when the UTF-16BE form is longer than two bytes.
       *
       * @param array $matches Match data; element 0 is the single UTF-8 character to escape.
       * @return string
       */
      protected function jsMatcher($matches)
      {
          $chr = $matches[0];
          if (strlen($chr) === 1) {
              return sprintf('\\x%02X', ord($chr));
          }

          $chr = $this->convertEncoding($chr, 'UTF-16BE', 'UTF-8');
          $hex = strtoupper(bin2hex($chr));
          if (strlen($hex) <= 4) {
              return sprintf('\\u%04s', $hex);
          }

          // More than four hex digits: emit the two 16-bit halves separately.
          $highSurrogate = substr($hex, 0, 4);
          $lowSurrogate  = substr($hex, 4, 4);
          return sprintf('\\u%04s\\u%04s', $highSurrogate, $lowSurrogate);
      }

      /**
       * Callback function for preg_replace_callback that applies CSS
       * escaping to all matches.
       *
       * The character is replaced by a backslash, its code point in upper-case
       * hexadecimal, and a trailing space.
       *
       * @param array $matches Match data; element 0 is the single UTF-8 character to escape.
       * @return string
       */
      protected function cssMatcher($matches)
      {
          $chr = $matches[0];
          if (strlen($chr) === 1) {
              $ord = ord($chr);
          } else {
              // UTF-32BE bytes read as one hex number give the code point.
              $chr = $this->convertEncoding($chr, 'UTF-32BE', 'UTF-8');
              $ord = hexdec(bin2hex($chr));
          }
          return sprintf('\\%X ', $ord);
      }

      /**
       * Converts a string to UTF-8 from the base encoding. The base encoding is set via this
       * class' constructor.
       *
       * @param string $string
       * @throws Exception\RuntimeException If the (converted) string is not valid UTF-8.
       * @return string
       */
      protected function toUtf8($string)
      {
          if ($this->getEncoding() === 'utf-8') {
              $result = $string;
          } else {
              $result = $this->convertEncoding($string, 'UTF-8', $this->getEncoding());
          }

          if (! $this->isUtf8($result)) {
              throw new Exception\RuntimeException(
                  sprintf('String to be escaped was not valid UTF-8 or could not be converted: %s', $result)
              );
          }

          return $result;
      }

      /**
       * Converts a string from UTF-8 to the base encoding. The base encoding is set via this
       * class' constructor.
       *
       * @param string $string
       * @return string
       */
      protected function fromUtf8($string)
      {
          if ($this->getEncoding() === 'utf-8') {
              return $string;
          }

          return $this->convertEncoding($string, $this->getEncoding(), 'UTF-8');
      }

      /**
       * Checks if a given string appears to be valid UTF-8 or not.
       *
       * The empty string counts as valid; otherwise validity is decided by
       * whether a pattern using the "u" modifier matches the string.
       *
       * @param string $string
       * @return bool
       */
      protected function isUtf8($string)
      {
          return ($string === '' || preg_match('/^./su', $string));
      }

      /**
       * Encoding conversion helper which wraps iconv and mbstring where they exist or throws
       * an exception where neither is available.
       *
       * iconv is preferred when both are present. If the conversion function
       * returns false, an empty string is returned instead.
       *
       * @param string $string
       * @param string $to
       * @param array|string $from
       * @throws Exception\RuntimeException If neither iconv nor mbstring is available.
       * @return string
       */
      protected function convertEncoding($string, $to, $from)
      {
          if (function_exists('iconv')) {
              $result = iconv($from, $to, $string);
          } elseif (function_exists('mb_convert_encoding')) {
              $result = mb_convert_encoding($string, $to, $from);
          } else {
              throw new Exception\RuntimeException(
                  get_class($this)
                  . ' requires either the iconv or mbstring extension to be installed'
                  . ' when escaping for non UTF-8 strings.'
              );
          }

          if ($result === false) {
              return ''; // return non-fatal blank string on encoding errors from users
          }
          return $result;
      }
  }