3 3 | * SPDX-License-Identifier: Apache-2.0
|
4 4 | */
|
5 5 |
|
6 6 | use std::borrow::Cow;
|
7 7 | use std::fmt;
|
8 8 |
|
/// The specific reason a JSON string could not be unescaped.
#[derive(Debug, PartialEq, Eq)]
enum EscapeErrorKind {
    /// A second `\u` escape was expected to complete a UTF-16 surrogate pair;
    /// carries the text that was found in place of the low word.
    ExpectedSurrogatePair(String),
    /// A backslash was followed by a character that is not a valid JSON escape.
    InvalidEscapeCharacter(char),
    /// The two UTF-16 code units (high, low) do not form a valid surrogate pair.
    /// Only compiled when the `replace-invalid-utf8` feature is disabled.
    #[cfg(not(feature = "replace-invalid-utf8"))]
    InvalidSurrogatePair(u16, u16),
    /// The text following `\u` is not a valid Unicode escape; carries that text.
    InvalidUnicodeEscape(String),
    /// The bytes of the JSON string were not valid UTF-8.
    InvalidUtf8,
    /// The string ended in the middle of an escape sequence.
    UnexpectedEndOfString,
}
|
18 19 |
|
19 20 | #[derive(Debug)]
|
20 21 | #[cfg_attr(test, derive(PartialEq, Eq))]
|
21 22 | pub struct EscapeError {
|
22 23 | kind: EscapeErrorKind,
|
23 24 | }
|
24 25 |
|
25 26 | impl std::error::Error for EscapeError {}
|
26 27 |
|
27 28 | impl fmt::Display for EscapeError {
|
28 29 | fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
29 30 | use EscapeErrorKind::*;
|
30 31 | match &self.kind {
|
31 32 | ExpectedSurrogatePair(low) => {
|
32 33 | write!(
|
33 34 | f,
|
34 35 | "expected a UTF-16 surrogate pair, but got {} as the low word",
|
35 36 | low
|
36 37 | )
|
37 38 | }
|
38 39 | InvalidEscapeCharacter(chr) => write!(f, "invalid JSON escape: \\{}", chr),
|
40 + | #[cfg(not(feature = "replace-invalid-utf8"))]
|
39 41 | InvalidSurrogatePair(high, low) => {
|
40 42 | write!(f, "invalid surrogate pair: \\u{:04X}\\u{:04X}", high, low)
|
41 43 | }
|
42 44 | InvalidUnicodeEscape(escape) => write!(f, "invalid JSON Unicode escape: \\u{}", escape),
|
43 45 | InvalidUtf8 => write!(f, "invalid UTF-8 codepoint in JSON string"),
|
44 46 | UnexpectedEndOfString => write!(f, "unexpected end of string"),
|
45 47 | }
|
46 48 | }
|
47 49 | }
|
48 50 |
|
175 177 | let codepoint_str =
|
176 178 | std::str::from_utf8(&rest[2..6]).map_err(|_| EscapeErrorKind::InvalidUtf8)?;
|
177 179 |
|
178 180 | // Error on characters `u16::from_str_radix` would otherwise accept, such as `+`
|
179 181 | if codepoint_str.bytes().any(|byte| !byte.is_ascii_hexdigit()) {
|
180 182 | return Err(EscapeErrorKind::InvalidUnicodeEscape(codepoint_str.into()).into());
|
181 183 | }
|
182 184 | Ok(u16::from_str_radix(codepoint_str, 16).expect("hex string is valid 16-bit value"))
|
183 185 | }
|
184 186 |
|
187 + | #[cfg(not(feature = "replace-invalid-utf8"))]
|
185 188 | /// Reads JSON Unicode escape sequences (i.e., "\u1234"). Will also read
|
186 189 | /// an additional codepoint if the first codepoint is the start of a surrogate pair.
|
187 190 | fn read_unicode_escapes(bytes: &[u8], into: &mut Vec<u8>) -> Result<usize, EscapeError> {
|
188 191 | let high = read_codepoint(bytes)?;
|
189 192 | let (bytes_read, chr) = if is_utf16_high_surrogate(high) {
|
190 193 | let low = read_codepoint(&bytes[6..])?;
|
191 194 | if !is_utf16_low_surrogate(low) {
|
192 195 | return Err(EscapeErrorKind::InvalidSurrogatePair(high, low).into());
|
193 196 | }
|
194 197 |
|
195 198 | let codepoint =
|
196 199 | std::char::from_u32(0x10000 + (high - 0xD800) as u32 * 0x400 + (low - 0xDC00) as u32)
|
197 200 | .ok_or(EscapeErrorKind::InvalidSurrogatePair(high, low))?;
|
198 201 | (12, codepoint)
|
199 202 | } else {
|
200 203 | let codepoint = std::char::from_u32(high as u32).ok_or_else(|| {
|
201 204 | EscapeErrorKind::InvalidUnicodeEscape(String::from_utf8_lossy(&bytes[0..6]).into())
|
202 205 | })?;
|
203 206 | (6, codepoint)
|
204 207 | };
|
205 208 |
|
206 209 | match chr.len_utf8() {
|
207 210 | 1 => into.push(chr as u8),
|
208 211 | _ => into.extend_from_slice(chr.encode_utf8(&mut [0; 4]).as_bytes()),
|
209 212 | }
|
210 213 | Ok(bytes_read)
|
211 214 | }
|
212 215 |
|
/// Reads JSON Unicode escape sequences (i.e., "\u1234") from the start of `bytes`
/// and appends the UTF-8 encoding of the result to `into`.
///
/// This is the lenient variant used when the `replace-invalid-utf8` feature is
/// enabled: a code unit that does not decode to a valid character (for example a
/// high surrogate that is not followed by a readable low surrogate) is written
/// as U+FFFD instead of being reported as an error.
///
/// Returns the number of input bytes consumed: 12 when a high surrogate was
/// combined with a following low surrogate, otherwise 6.
///
/// # Errors
///
/// Only a failure of `read_codepoint` on the first escape is propagated.
#[cfg(feature = "replace-invalid-utf8")]
fn read_unicode_escapes(bytes: &[u8], into: &mut Vec<u8>) -> Result<usize, EscapeError> {
    let first = read_codepoint(bytes)?;

    let (consumed, decoded) = if is_utf16_high_surrogate(first) {
        // Any failure to read a low surrogate is treated as "no partner": only the
        // first escape is consumed, and the caller sees the rest of the input again.
        let partner = match read_codepoint(&bytes[6..]) {
            Ok(second) if is_utf16_low_surrogate(second) => Some(second),
            _ => None,
        };
        match partner {
            Some(second) => {
                let scalar =
                    0x10000 + u32::from(first - 0xD800) * 0x400 + u32::from(second - 0xDC00);
                (12, std::char::from_u32(scalar))
            }
            None => (6, None),
        }
    } else {
        (6, std::char::from_u32(u32::from(first)))
    };

    // Anything that failed to decode is emitted as the Unicode replacement character.
    let output = decoded.unwrap_or('\u{FFFD}');
    let mut utf8 = [0u8; 4];
    into.extend_from_slice(output.encode_utf8(&mut utf8).as_bytes());

    Ok(consumed)
}
|
244 + |
|
213 245 | #[cfg(test)]
|
214 246 | mod test {
|
215 247 | use super::escape_string;
|
216 248 | use crate::escape::{unescape_string, EscapeErrorKind};
|
217 249 | use std::borrow::Cow;
|
218 250 |
|
/// Checks `escape_string` against a table of (expected output, input) pairs.
#[test]
fn escape() {
    let cases: &[(&str, &str)] = &[
        ("", ""),
        ("foo", "foo"),
        ("foo\\r\\n", "foo\r\n"),
        ("foo\\r\\nbar", "foo\r\nbar"),
        (r"foo\\bar", r"foo\bar"),
        (r"\\foobar", r"\foobar"),
        (r"\bf\fo\to\r\n", "\u{08}f\u{0C}o\to\r\n"),
        ("\\\"test\\\"", "\"test\""),
        ("\\u0000", "\u{0}"),
        ("\\u001f", "\u{1f}"),
    ];

    for &(expected, input) in cases {
        assert_eq!(expected, escape_string(input).as_ref(), "input: {:?}", input);
    }
}
|
235 267 |
|
/// A string without escapes must come back unchanged and without allocating.
#[test]
fn unescape_no_escapes() {
    let input = "test test";
    let result = unescape_string(input).unwrap();
    assert_eq!(input, result);
    // No escapes were present, so the input should be borrowed rather than copied.
    assert!(matches!(result, Cow::Borrowed(_)));
}
|
242 274 |
|
/// Checks `unescape_string` in the strict build (invalid surrogates are errors).
#[cfg(not(feature = "replace-invalid-utf8"))]
#[test]
fn unescape() {
    // (expected output, escaped input)
    let successes: &[(&str, &str)] = &[
        ("\x08f\x0Co\to\r\n", r"\bf\fo\to\r\n"),
        ("\"test\"", r#"\"test\""#),
        ("\x00", "\\u0000"),
        ("\x1f", "\\u001f"),
        ("foo\r\nbar", "foo\\r\\nbar"),
        ("foo\r\n", "foo\\r\\n"),
        ("\r\nbar", "\\r\\nbar"),
        ("\u{10437}", "\\uD801\\uDC37"),
    ];
    for &(expected, input) in successes {
        assert_eq!(expected, unescape_string(input).unwrap(), "input: {:?}", input);
    }

    // (expected error kind, escaped input)
    let failures = vec![
        // Truncated escapes.
        (EscapeErrorKind::UnexpectedEndOfString, "\\"),
        (EscapeErrorKind::UnexpectedEndOfString, "\\u"),
        (EscapeErrorKind::UnexpectedEndOfString, "\\u00"),
        // Unknown escape character.
        (EscapeErrorKind::InvalidEscapeCharacter('z'), "\\z"),
        // Broken surrogate pairs.
        (
            EscapeErrorKind::ExpectedSurrogatePair("\\nasdf".into()),
            "\\uD801\\nasdf",
        ),
        (EscapeErrorKind::UnexpectedEndOfString, "\\uD801\\u00"),
        (
            EscapeErrorKind::InvalidSurrogatePair(0xD801, 0xC501),
            "\\uD801\\uC501",
        ),
        // Characters that are not hex digits.
        (
            EscapeErrorKind::InvalidUnicodeEscape("+04D".into()),
            "\\u+04D",
        ),
    ];
    for (kind, input) in failures {
        assert_eq!(Err(kind.into()), unescape_string(input), "input: {:?}", input);
    }
}
|
292 325 |
|
/// Checks `unescape_string` in the lenient build, where invalid or unpaired
/// surrogates are replaced with U+FFFD instead of producing an error.
#[cfg(feature = "replace-invalid-utf8")]
#[test]
fn unescape() {
    // The replacement character is written as an escape so the expectations
    // survive editors and tools that mangle non-ASCII literals.
    // (expected output, escaped input)
    let successes: &[(&str, &str)] = &[
        ("\x08f\x0Co\to\r\n", r"\bf\fo\to\r\n"),
        ("\"test\"", r#"\"test\""#),
        ("\x00", "\\u0000"),
        ("\x1f", "\\u001f"),
        ("foo\r\nbar", "foo\\r\\nbar"),
        ("foo\r\n", "foo\\r\\n"),
        ("\r\nbar", "\\r\\nbar"),
        // Regular character.
        ("A", "\\u0041"),
        // Valid surrogate pairs.
        ("\u{10437}", "\\uD801\\uDC37"),
        ("\u{1F980}", "\\uD83E\\uDD80"),
        // Single surrogates each become one replacement character.
        ("\u{FFFD}", "\\uD800"), // High surrogate without low surrogate
        ("\u{FFFD}", "\\uDC00"), // Low surrogate without high surrogate
        // Orphaned surrogate in the middle of a string.
        ("test\u{FFFD}test", "test\\uD800test"),
        // Invalid pairs become two replacement characters.
        ("\u{FFFD}\u{FFFD}", "\\uD800\\uD800"), // Two identical high surrogates
        ("\u{FFFD}\u{FFFD}", "\\uDC00\\uDC00"), // Two identical low surrogates
        ("\u{FFFD}\u{FFFD}", "\\uD800\\uD801"), // High + High
        ("\u{FFFD}\u{FFFD}", "\\uDC00\\uDC01"), // Low + Low
        ("\u{FFFD}\u{FFFD}", "\\uDC00\\uD800"), // Low + High
        // Invalid surrogate followed by a valid surrogate pair.
        ("\u{FFFD}\u{10437}", "\\uD800\\uD801\\uDC37"),
        // Surrogate followed by a non-surrogate.
        ("\u{FFFD}A", "\\uD800\\u0041"), // High + ASCII
        ("\u{FFFD}A", "\\uDC00\\u0041"), // Low + ASCII
    ];
    for &(expected, input) in successes {
        assert_eq!(expected, unescape_string(input).unwrap(), "input: {:?}", input);
    }

    // These error cases are not affected by replacement.
    // (expected error kind, escaped input)
    let failures = vec![
        (EscapeErrorKind::UnexpectedEndOfString, "\\"),
        (EscapeErrorKind::UnexpectedEndOfString, "\\u"),
        (EscapeErrorKind::UnexpectedEndOfString, "\\u00"),
        (EscapeErrorKind::InvalidEscapeCharacter('z'), "\\z"),
        (
            EscapeErrorKind::InvalidUnicodeEscape("+04D".into()),
            "\\u+04D",
        ),
    ];
    for (kind, input) in failures {
        assert_eq!(Err(kind.into()), unescape_string(input), "input: {:?}", input);
    }
}
|
393 + |
|
293 394 | use proptest::proptest;
|
294 395 | proptest! {
|
295 396 | #[test]
|
296 397 | fn matches_serde_json(s in ".*") {
|
297 398 | let serde_escaped = serde_json::to_string(&s).unwrap();
|
298 399 | let serde_escaped = &serde_escaped[1..(serde_escaped.len() - 1)];
|
299 400 | assert_eq!(serde_escaped,escape_string(&s))
|
300 401 | }
|
301 402 |
|
302 403 | #[test]
|