vstd/
utf8.rs

1//! Definitions for UTF-8 encoding and decoding of character sequences.
2//!
3//! [UTF-8](https://en.wikipedia.org/wiki/UTF-8) is a variable-width character encoding scheme.
4//! Each character is encoded with between 1 and 4 bytes.
5//! Specifications for encoding and decoding characters to their UTF-8 byte sequences are given by [`encode_utf8`] and [`decode_utf8`], respectively.
6//! Characters in the ASCII character set are encoded in UTF-8 with 1-byte encodings identical to those used by ASCII.
7//! Thus, some UTF-8 byte sequences can also be considered ASCII byte sequences, as defined in [`is_ascii_chars`].
8//!
9//! UTF-8 encodes numerical values called Unicode _scalars_ (see below), which assign a unique value to each Unicode character.
10//! A scalar value is encoded in UTF-8 using a leading byte and between 0 and 3 continuation bytes, where larger scalar values require more continuation bytes.
11//! The first part of the bit pattern in the leading byte is reserved for describing the number of bytes in the scalar's encoding (e.g., [`is_leading_byte_width_1`]).
12//! The rest of the leading byte contains data bits corresponding to the scalar's value (e.g., [`leading_bits_width_1`]).
13//! The continuation bytes also follow a specific bit pattern ([`is_continuation_byte`]) and contain the remainder of the data bits ([`continuation_bits`]).
14//!
15//! This module makes use of terminology from the [Unicode standard](https://www.unicode.org/glossary/).
16//! A Unicode _scalar_ is a numerical value (represented in this module as a `u32`) corresponding to a character that can be encoded in UTF-8.
17//! All Rust `char`s correspond to Unicode scalars ([`char_is_scalar`]),
18//! and every numerical value encoded in a UTF-8 byte sequence must fall within the range defined for Unicode scalars ([`is_scalar`]).
19//! The Unicode standard also defines a _codepoint_ to be a numerical value which falls in the range available for encoding characters in UTF-8.
20//! This may sound similar to the definition of scalar.
21//! However, the definition of codepoint is more permissive than that for scalars,
22//! as it includes some values which are technically possible to encode in the UTF-8 scheme,
23//! but in fact are not legal Unicode values
24//! (namely, the [high-surrogate and low-surrogate ranges](https://en.wikipedia.org/wiki/UTF-8#Surrogates)).
25//! To align with the Unicode terminology, in this module, we use the term "scalar" to describe the numerical values
26//! which can be encoded in valid UTF-8 byte sequences, and the term "codepoint" to describe numerical values which
27//! are learned upon decoding a byte sequence but may or may not be legal Unicode values.
28use super::prelude::*;
29use super::seq::*;
30
31verus! {
32
33broadcast use super::seq::group_seq_axioms;
34/* Decoding UTF-8 to chars */
35
36/// True when the given byte conforms to the bit pattern for the first byte of a 1-byte UTF-8 encoding of a single codepoint.
37/// The byte must have the form 0xxxxxxx.
38pub open spec fn is_leading_byte_width_1(byte: u8) -> bool {
39    0x00 <= byte <= 0x7f
40}
41
42/// True when the given byte conforms to the bit pattern for the first byte of a 2-byte UTF-8 encoding of a single codepoint.
43/// The byte must have the form 110xxxxx.
44pub open spec fn is_leading_byte_width_2(byte: u8) -> bool {
45    0xc0 <= byte <= 0xdf
46}
47
48/// True when the given byte conforms to the bit pattern for the first byte of a 3-byte UTF-8 encoding of a single codepoint.
49/// The byte must have the form 1110xxxx.
50pub open spec fn is_leading_byte_width_3(byte: u8) -> bool {
51    0xe0 <= byte <= 0xef
52}
53
54/// True when the given byte conforms to the bit pattern for the first byte of a 4-byte UTF-8 encoding of a single codepoint.
55/// The byte must have the form 11110xxx.
56pub open spec fn is_leading_byte_width_4(byte: u8) -> bool {
57    0xf0 <= byte <= 0xf7
58}
59
60/// True when the given byte conforms to the bit pattern for a continuation byte of a UTF-8 encoding of a single codepoint.
61/// The byte must have the form 10xxxxxx.
62pub open spec fn is_continuation_byte(byte: u8) -> bool {
63    0x80 <= byte <= 0xbf
64}
65
66/// Value of the 6 data bits from the given continuation byte, assuming that it is a valid continuation byte for a UTF-8 encoding.
67pub open spec fn continuation_bits(byte: u8) -> u32
68    recommends
69        is_continuation_byte(byte),
70{
71    // 0x3f = 0011 1111
72    (byte & 0x3f) as u32
73}
74
75/// Value of the 7 data bits from the given byte, assuming that it is a valid leading byte for a 1-byte UTF-8 encoding.
76pub open spec fn leading_bits_width_1(byte: u8) -> u32
77    recommends
78        is_leading_byte_width_1(byte),
79{
80    // 0x7f = 0111 1111
81    (byte & 0x7F) as u32
82}
83
84/// Value of the 5 data bits from the given byte, assuming that it is a valid leading byte for a 2-byte UTF-8 encoding.
85pub open spec fn leading_bits_width_2(byte: u8) -> u32
86    recommends
87        is_leading_byte_width_2(byte),
88{
89    // 0x1f = 0001 1111
90    (byte & 0x1F) as u32
91}
92
93/// Value of the 4 data bits from the given byte, assuming that it is a valid leading byte for a 3-byte UTF-8 encoding.
94pub open spec fn leading_bits_width_3(byte: u8) -> u32
95    recommends
96        is_leading_byte_width_3(byte),
97{
98    // 0x0f = 0000 1111
99    (byte & 0x0F) as u32
100}
101
102/// Value of the 3 data bits from the given byte, assuming that it is a valid leading byte for a 4-byte UTF-8 encoding.
103pub open spec fn leading_bits_width_4(byte: u8) -> u32
104    recommends
105        is_leading_byte_width_4(byte),
106{
107    // 0x07 = 0000 0111
108    (byte & 0x07) as u32
109}
110
111/// The codepoint encoded by the given byte, assuming that it is a valid leading byte for a 1-byte UTF-8 encoding.
112pub open spec fn codepoint_width_1(byte1: u8) -> u32
113    recommends
114        is_leading_byte_width_1(byte1),
115{
116    leading_bits_width_1(byte1)
117}
118
119// 0xc1 = 1100 0001
120// 0xc1 & 0x1f = 0x01
121// 0x01 << 6 = 0100 0000 = 0x40
122// If byte2 = 0xff then byte2 & 0x3f = 0x3f = 0011 1111
123// so highest possible is 0111 1111 = 0x7f < 0x80
124/// The codepoint encoded by the given 2 bytes, assuming that they are a valid leading and continuation byte, respectively, for 2-byte UTF-8 encoding.
125pub open spec fn codepoint_width_2(byte1: u8, byte2: u8) -> u32
126    recommends
127        is_leading_byte_width_2(byte1),
128        is_continuation_byte(byte2),
129{
130    (leading_bits_width_2(byte1) << 6) | continuation_bits(byte2)
131}
132
133/// The codepoint encoded by the given 3 bytes, assuming that they are a valid leading and continuation bytes, respectively, for 3-byte UTF-8 encoding.
134pub open spec fn codepoint_width_3(byte1: u8, byte2: u8, byte3: u8) -> u32
135    recommends
136        is_leading_byte_width_3(byte1),
137        is_continuation_byte(byte2),
138        is_continuation_byte(byte3),
139{
140    (leading_bits_width_3(byte1) << 12) | (continuation_bits(byte2) << 6) | continuation_bits(byte3)
141}
142
143// 0xf7 = 1111 0111
144// 0xf7 & 0x07 = 0x07
145// 0x07 << 18 = 0001 1100 0000 0000 0000 0000 = 0x1c0000
146// 0xf5 = 1111 0101
147// 0xf5 & 0x07 = 0x05
148// 0x05 << 18 = 0001 0100 0000 0000 0000 0000 = 0x140000
149/// The codepoint encoded by the given 4 bytes, assuming that they are a valid leading and continuation bytes, respectively, for 4-byte UTF-8 encoding.
150pub open spec fn codepoint_width_4(byte1: u8, byte2: u8, byte3: u8, byte4: u8) -> u32
151    recommends
152        is_leading_byte_width_4(byte1),
153        is_continuation_byte(byte2),
154        is_continuation_byte(byte3),
155        is_continuation_byte(byte4),
156{
157    (leading_bits_width_4(byte1) << 18) | (continuation_bits(byte2) << 12) | (continuation_bits(
158        byte3,
159    ) << 6) | continuation_bits(byte4)
160}
161
162/// True when the given byte sequence begins with a well-formed leading byte and an appropriate number of well-formed continuation bytes for a UTF-8 encoding of a single codepoint.
163pub open spec fn valid_leading_and_continuation_bytes_first_codepoint(bytes: Seq<u8>) -> bool {
164    ||| (bytes.len() >= 1 && is_leading_byte_width_1(bytes[0]))
165    ||| (bytes.len() >= 2 && is_leading_byte_width_2(bytes[0]) && is_continuation_byte(bytes[1]))
166    ||| (bytes.len() >= 3 && is_leading_byte_width_3(bytes[0]) && is_continuation_byte(bytes[1])
167        && is_continuation_byte(bytes[2]))
168    ||| (bytes.len() >= 4 && is_leading_byte_width_4(bytes[0]) && is_continuation_byte(bytes[1])
169        && is_continuation_byte(bytes[2]) && is_continuation_byte(bytes[3]))
170}
171
172/// Returns the first codepoint encoded in UTF-8 in the given byte sequence, assuming that the sequence begins with a well-formed leading byte and an appropriate number of well-formed continuation bytes.
173pub open spec fn decode_first_codepoint(bytes: Seq<u8>) -> u32
174    recommends
175        valid_leading_and_continuation_bytes_first_codepoint(bytes),
176{
177    if is_leading_byte_width_1(bytes[0]) {
178        codepoint_width_1(bytes[0])
179    } else if is_leading_byte_width_2(bytes[0]) {
180        codepoint_width_2(bytes[0], bytes[1])
181    } else if is_leading_byte_width_3(bytes[0]) {
182        codepoint_width_3(bytes[0], bytes[1], bytes[2])
183    } else {
184        codepoint_width_4(bytes[0], bytes[1], bytes[2], bytes[3])
185    }
186}
187
188/// The length in bytes of the first codepoint encoded in UTF-8 in the given byte sequence, assuming that the sequence begins with a well-formed leading byte and an appropriate number of well-formed continuation bytes.
189pub open spec fn length_of_first_codepoint(bytes: Seq<u8>) -> int
190    recommends
191        valid_leading_and_continuation_bytes_first_codepoint(bytes),
192{
193    if is_leading_byte_width_1(bytes[0]) {
194        1
195    } else if is_leading_byte_width_2(bytes[0]) {
196        2
197    } else if is_leading_byte_width_3(bytes[0]) {
198        3
199    } else {
200        4
201    }
202}
203
204/// True when the given codepoint, when encoded in UTF-8 using `len` number of bytes, would not be an "overlong encoding".
205/// An overlong encoding is one that uses more bytes than needed to encode the given value.
206pub open spec fn not_overlong_encoding(codepoint: u32, len: int) -> bool {
207    &&& (len == 2 ==> 0x80 <= codepoint)
208    &&& (len == 3 ==> 0x800 <= codepoint)
209    &&& (len == 4 ==> 0x10000 <= codepoint <= 0x10ffff)
210}
211
212/// True when the given codepoint does not fall into the "surrogate range" of the Unicode standard.
213/// The surrogate range contains values which are technically possible to encode in UTF-8 but are not valid Unicode scalars.
214pub open spec fn not_surrogate(codepoint: u32) -> bool {
215    !(0xD800 <= codepoint <= 0xDFFF)
216}
217
218/// True when the given byte sequence begins with a well-formed UTF-8 encoding of a single scalar.
219/// To be a well-formed encoding, the bytes must: follow the expected bit pattern for leading and continuation bytes
220/// for a single scalar encoding, not be an "overlong encoding", and not fall in the surrogate range.
221pub open spec fn valid_first_scalar(bytes: Seq<u8>) -> bool {
222    &&& valid_leading_and_continuation_bytes_first_codepoint(bytes)
223    &&& not_overlong_encoding(decode_first_codepoint(bytes), length_of_first_codepoint(bytes))
224    &&& not_surrogate(decode_first_codepoint(bytes))
225}
226
227/// The first scalar encoded in UTF-8 in the given byte sequence, assuming that the sequence begins with a well-formed encoding of a single scalar.
228pub open spec fn decode_first_scalar(bytes: Seq<u8>) -> u32
229    recommends
230        valid_first_scalar(bytes),
231{
232    decode_first_codepoint(bytes)
233}
234
235/// The length in bytes of first scalar encoded in UTF-8 in the given byte sequence, assuming that the sequence begins with a well-formed encoding of a single scalar.
236pub open spec fn length_of_first_scalar(bytes: Seq<u8>) -> int
237    recommends
238        valid_first_scalar(bytes),
239{
240    length_of_first_codepoint(bytes)
241}
242
243/// Removes the first scalar encoded in UTF-8 in the given byte sequence and returns the rest of the sequence, assuming that the sequence begins with a well-formed encoding of a single scalar.
244pub open spec fn pop_first_scalar(bytes: Seq<u8>) -> Seq<u8>
245    recommends
246        valid_first_scalar(bytes),
247{
248    bytes.subrange(length_of_first_scalar(bytes), bytes.len() as int)
249}
250
251proof fn lemma_pop_first_scalar_decreases(bytes: Seq<u8>)
252    requires
253        valid_first_scalar(bytes),
254    ensures
255        pop_first_scalar(bytes).len() < bytes.len(),
256{
257    assert(length_of_first_scalar(bytes) <= bytes.len() as int);
258    assert(pop_first_scalar(bytes).len() == bytes.len() as int - length_of_first_scalar(bytes)) by {
259        axiom_seq_subrange_len(bytes, length_of_first_scalar(bytes), bytes.len() as int)
260    };
261}
262
263/// Takes the bytes corresponding to the first scalar encoded in UTF-8 in the given byte sequence, assuming that the sequence begins with a well-formed encoding of a single scalar.
264pub open spec fn take_first_scalar(bytes: Seq<u8>) -> Seq<u8>
265    recommends
266        valid_first_scalar(bytes),
267{
268    bytes.subrange(0, length_of_first_scalar(bytes))
269}
270
271/// True when the given bytes form a valid UTF-8 encoding.
272pub open spec fn valid_utf8(bytes: Seq<u8>) -> bool
273    decreases bytes.len(),
274{
275    bytes.len() != 0 ==> valid_first_scalar(bytes) && valid_utf8(pop_first_scalar(bytes))
276}
277
278/// The sequence of characters encoded as Unicode scalars in the given bytes, assuming that the bytes form a valid UTF-8 encoding.
279pub open spec fn decode_utf8(bytes: Seq<u8>) -> Seq<char>
280    recommends
281        valid_utf8(bytes),
282    decreases bytes.len(),
283    when valid_utf8(bytes)
284{
285    if bytes.len() == 0 {
286        seq![]
287    } else {
288        seq![decode_first_scalar(bytes) as char] + decode_utf8(pop_first_scalar(bytes))
289    }
290}
291
292/// The length in bytes of the last scalar encoded in UTF-8 in the given byte sequence, assuming that the bytes form a valid UTF-8 encoding.
293pub open spec fn length_of_last_scalar(bytes: Seq<u8>) -> int
294    recommends
295        valid_utf8(bytes),
296        bytes.len() > 0,
297{
298    let n = bytes.len() as int;
299    if !is_continuation_byte(bytes[n - 1]) {
300        1
301    } else if !is_continuation_byte(bytes[n - 2]) {
302        2
303    } else if !is_continuation_byte(bytes[n - 3]) {
304        3
305    } else {
306        4
307    }
308}
309
310/// Takes the bytes corresponding to the last scalar encoded in UTF-8 in the given byte sequence, assuming that the bytes form a valid UTF-8 encoding.
311pub open spec fn take_last_scalar(bytes: Seq<u8>) -> Seq<u8>
312    recommends
313        valid_utf8(bytes),
314        bytes.len() > 0,
315{
316    let len = length_of_last_scalar(bytes);
317    bytes.subrange(bytes.len() - len, bytes.len() as int)
318}
319
320/// The last scalar encoded in UTF-8 in the given byte sequence, assuming that the bytes form a valid UTF-8 encoding.
321pub open spec fn decode_last_scalar(bytes: Seq<u8>) -> u32
322    recommends
323        valid_utf8(bytes),
324        bytes.len() > 0,
325{
326    let n = bytes.len() as int;
327    if !is_continuation_byte(bytes[n - 1]) {
328        codepoint_width_1(bytes[n - 1])
329    } else if !is_continuation_byte(bytes[n - 2]) {
330        codepoint_width_2(bytes[n - 2], bytes[n - 1])
331    } else if !is_continuation_byte(bytes[n - 3]) {
332        codepoint_width_3(bytes[n - 3], bytes[n - 2], bytes[n - 1])
333    } else {
334        codepoint_width_4(bytes[n - 4], bytes[n - 3], bytes[n - 2], bytes[n - 1])
335    }
336}
337
338/* Encoding chars as UTF-8 */
339
340/// True when the given value is a Unicode scalar with a 1-byte UTF-8 encoding.
341pub open spec fn has_width_1_encoding(v: u32) -> bool {
342    0 <= v <= 0x7F
343}
344
345/// True when the given value is a Unicode scalar with a 2-byte UTF-8 encoding.
346pub open spec fn has_width_2_encoding(v: u32) -> bool {
347    0x80 <= v <= 0x7FF
348}
349
350/// True when the given value is a Unicode scalar with a 3-byte UTF-8 encoding.
351pub open spec fn has_width_3_encoding(v: u32) -> bool {
352    0x800 <= v <= 0xFFFF && !(0xD800 <= v <= 0xDFFF)
353}
354
355/// True when the given value is a Unicode scalar with a 4-byte UTF-8 encoding.
356pub open spec fn has_width_4_encoding(v: u32) -> bool {
357    0x10000 <= v <= 0x10FFFF
358}
359
360/// True when the given `u32` represents a Unicode scalar, i.e., a value that can be encoded in UTF-8.
361/// This definition is equivalent to: `0 <= v <= 0x10ffff && !(0xD800 <= v <= 0xDFFF)`.
362pub open spec fn is_scalar(v: u32) -> bool {
363    ||| has_width_1_encoding(v)
364    ||| has_width_2_encoding(v)
365    ||| has_width_3_encoding(v)
366    ||| has_width_4_encoding(v)
367}
368
369/// The first (and only) byte of the UTF-8 encoding of the given scalar value, assuming that the scalar has a 1-byte UTF-8 encoding.
370pub open spec fn leading_byte_width_1(scalar: u32) -> u8
371    recommends
372        has_width_1_encoding(scalar),
373{
374    (scalar & 0x7F) as u8
375}
376
377/// The first byte of the UTF-8 encoding of the given scalar value, assuming that the scalar has a 2-byte UTF-8 encoding.
378pub open spec fn leading_byte_width_2(scalar: u32) -> u8
379    recommends
380        has_width_2_encoding(scalar),
381{
382    0xC0 | ((scalar >> 6) & 0x1F) as u8
383}
384
385/// The first byte of the UTF-8 encoding of the given scalar value, assuming that the scalar has a 3-byte UTF-8 encoding.
386pub open spec fn leading_byte_width_3(scalar: u32) -> u8
387    recommends
388        has_width_3_encoding(scalar),
389{
390    0xE0 | ((scalar >> 12) & 0x0F) as u8
391}
392
393/// The first byte of the UTF-8 encoding of the given scalar value, assuming that the scalar has a 4-byte UTF-8 encoding.
394pub open spec fn leading_byte_width_4(scalar: u32) -> u8
395    recommends
396        has_width_4_encoding(scalar),
397{
398    0xF0 | ((scalar >> 18) & 0x7) as u8
399}
400
401/// The last continuation byte of the UTF-8 encoding of the given scalar value, assuming that the scalar has a 2, 3, or 4-byte UTF-8 encoding.
402pub open spec fn last_continuation_byte(scalar: u32) -> u8
403    recommends
404        has_width_2_encoding(scalar) || has_width_3_encoding(scalar) || has_width_4_encoding(
405            scalar,
406        ),
407{
408    0x80 | (scalar & 0x3F) as u8
409}
410
411/// The second-to-last continuation byte of the UTF-8 encoding of the given scalar value, assuming that the scalar has a 3 or 4-byte UTF-8 encoding.
412pub open spec fn second_last_continuation_byte(scalar: u32) -> u8
413    recommends
414        has_width_3_encoding(scalar) || has_width_4_encoding(scalar),
415{
416    0x80 | ((scalar >> 6) & 0x3F) as u8
417}
418
419/// The third-to-last continuation byte of the UTF-8 encoding of the given scalar value, assuming that the scalar has a 4-byte UTF-8 encoding.
420pub open spec fn third_last_continuation_byte(scalar: u32) -> u8
421    recommends
422        has_width_4_encoding(scalar),
423{
424    0x80 | ((scalar >> 12) & 0x3F) as u8
425}
426
427/// The UTF-8 encoding of the given value, assuming that it is a Unicode scalar.
428pub open spec fn encode_scalar(scalar: u32) -> Seq<u8>
429    recommends
430        is_scalar(scalar),
431{
432    if has_width_1_encoding(scalar) {
433        seq![leading_byte_width_1(scalar)]
434    } else if has_width_2_encoding(scalar) {
435        seq![leading_byte_width_2(scalar), last_continuation_byte(scalar)]
436    } else if has_width_3_encoding(scalar) {
437        seq![
438            leading_byte_width_3(scalar),
439            second_last_continuation_byte(scalar),
440            last_continuation_byte(scalar),
441        ]
442    } else {
443        seq![
444            leading_byte_width_4(scalar),
445            third_last_continuation_byte(scalar),
446            second_last_continuation_byte(scalar),
447            last_continuation_byte(scalar),
448        ]
449    }
450}
451
452/// The UTF-8 encoding of the given `char` sequence.
453pub open spec fn encode_utf8(chars: Seq<char>) -> Seq<u8>
454    decreases chars.len(),
455{
456    if chars.len() == 0 {
457        seq![]
458    } else {
459        encode_scalar(chars[0] as u32) + encode_utf8(chars.drop_first())
460    }
461}
462
463/* Correspondence between encode_utf8 and decode_utf8 definitions */
464
465// Performing encode followed by decode on a scalar with a 1-byte UTF-8 encoding results in the same value.
466proof fn encode_decode_width_1(c: u32)
467    by (bit_vector)
468    requires
469        has_width_1_encoding(c),
470    ensures
471        ({
472            let b1 = leading_byte_width_1(c);
473            &&& is_leading_byte_width_1(b1)
474            &&& codepoint_width_1(b1) == c
475        }),
476{
477}
478
479// Performing decode followed by encode on a 1-byte UTF-8 encoding results in the same byte.
480proof fn decode_encode_width_1(b1: u8)
481    by (bit_vector)
482    requires
483        is_leading_byte_width_1(b1),
484    ensures
485        ({
486            let c = codepoint_width_1(b1);
487            &&& has_width_1_encoding(c)
488            &&& leading_byte_width_1(c) == b1
489        }),
490{
491}
492
493// Performing encode followed by decode on a scalar with a 2-byte UTF-8 encoding  results in the same value.
494proof fn encode_decode_width_2(c: u32)
495    by (bit_vector)
496    requires
497        has_width_2_encoding(c),
498    ensures
499        ({
500            let b1 = leading_byte_width_2(c);
501            let b2 = last_continuation_byte(c);
502            &&& is_leading_byte_width_2(b1)
503            &&& is_continuation_byte(b2)
504            &&& codepoint_width_2(b1, b2) == c
505        }),
506{
507}
508
509// Performing decode followed by encode on a 2-byte UTF-8 encoding results in the same bytes.
510proof fn decode_encode_width_2(b1: u8, b2: u8)
511    by (bit_vector)
512    requires
513        is_leading_byte_width_2(b1),
514        is_continuation_byte(b2),
515        not_overlong_encoding(codepoint_width_2(b1, b2), 2),
516    ensures
517        ({
518            let c = codepoint_width_2(b1, b2);
519            &&& has_width_2_encoding(c)
520            &&& leading_byte_width_2(c) == b1
521            &&& last_continuation_byte(c) == b2
522        }),
523{
524}
525
526// Performing encode followed by decode on a scalar with a 3-byte UTF-8 encoding  results in the same value.
527proof fn encode_decode_width_3(c: u32)
528    by (bit_vector)
529    requires
530        has_width_3_encoding(c),
531    ensures
532        ({
533            let b1 = leading_byte_width_3(c);
534            let b2 = second_last_continuation_byte(c);
535            let b3 = last_continuation_byte(c);
536            &&& is_leading_byte_width_3(b1)
537            &&& is_continuation_byte(b2)
538            &&& is_continuation_byte(b3)
539            &&& codepoint_width_3(b1, b2, b3) == c
540        }),
541{
542}
543
544// Performing decode followed by encode on a 3-byte UTF-8 encoding results in the same bytes.
545proof fn decode_encode_width_3(b1: u8, b2: u8, b3: u8)
546    by (bit_vector)
547    requires
548        is_leading_byte_width_3(b1),
549        is_continuation_byte(b2),
550        is_continuation_byte(b3),
551        not_overlong_encoding(codepoint_width_3(b1, b2, b3), 3),
552        not_surrogate(codepoint_width_3(b1, b2, b3)),
553    ensures
554        ({
555            let c = codepoint_width_3(b1, b2, b3);
556            &&& has_width_3_encoding(c)
557            &&& leading_byte_width_3(c) == b1
558            &&& second_last_continuation_byte(c) == b2
559            &&& last_continuation_byte(c) == b3
560        }),
561{
562}
563
564// Performing encode followed by decode on a scalar with a 4-byte UTF-8 encoding results in the same value.
565proof fn encode_decode_width_4(c: u32)
566    by (bit_vector)
567    requires
568        has_width_4_encoding(c),
569    ensures
570        ({
571            let b1 = leading_byte_width_4(c);
572            let b2 = third_last_continuation_byte(c);
573            let b3 = second_last_continuation_byte(c);
574            let b4 = last_continuation_byte(c);
575            &&& is_leading_byte_width_4(b1)
576            &&& is_continuation_byte(b2)
577            &&& is_continuation_byte(b3)
578            &&& is_continuation_byte(b4)
579            &&& codepoint_width_4(b1, b2, b3, b4) == c
580        }),
581{
582}
583
584// Performing decode followed by encode on a 4-byte UTF-8 encoding results in the same bytes.
585proof fn decode_encode_width_4(b1: u8, b2: u8, b3: u8, b4: u8)
586    by (bit_vector)
587    requires
588        is_leading_byte_width_4(b1),
589        is_continuation_byte(b2),
590        is_continuation_byte(b3),
591        is_continuation_byte(b4),
592        not_overlong_encoding(codepoint_width_4(b1, b2, b3, b4), 4),
593    ensures
594        ({
595            let c = codepoint_width_4(b1, b2, b3, b4);
596            &&& has_width_4_encoding(c)
597            &&& leading_byte_width_4(c) == b1
598            &&& third_last_continuation_byte(c) == b2
599            &&& second_last_continuation_byte(c) == b3
600            &&& last_continuation_byte(c) == b4
601        }),
602{
603}
604
605/// A `char` always represents a Unicode scalar value.
606pub broadcast proof fn char_is_scalar(c: char)
607    ensures
608        is_scalar(#[trigger] (c as u32)),
609{
610}
611
612/// Ensures that a `char`, when cast to a `u32`, can be cast back to a `char`.
613pub broadcast proof fn char_u32_cast(c: char, u: u32)
614    requires
615        u == #[trigger] (c as u32),
616    ensures
617        #[trigger] (u as char) == c,
618{
619}
620
621/// Properties of the first scalar from the result of [`encode_utf8`].
622pub proof fn encode_utf8_first_scalar(chars: Seq<char>)
623    requires
624        chars.len() > 0,
625    ensures
626        decode_first_scalar(encode_utf8(chars)) == chars[0] as u32,
627        length_of_first_scalar(encode_utf8(chars)) == encode_scalar(chars[0] as u32).len(),
628        valid_first_scalar(encode_utf8(chars)),
629{
630    char_is_scalar(chars[0]);
631    let s = chars[0] as u32;
632    if has_width_1_encoding(s) {
633        encode_decode_width_1(s);
634    } else if has_width_2_encoding(s) {
635        encode_decode_width_2(s);
636    } else if has_width_3_encoding(s) {
637        encode_decode_width_3(s);
638    } else {
639        encode_decode_width_4(s);
640    }
641}
642
643/// Ensures the result of [`encode_utf8`] always satisfies [`valid_utf8`].
644pub broadcast proof fn encode_utf8_valid_utf8(chars: Seq<char>)
645    ensures
646        valid_utf8(#[trigger] encode_utf8(chars)),
647    decreases chars.len(),
648{
649    if chars.len() == 0 {
650    } else {
651        let bytes = encode_utf8(chars);
652        encode_utf8_first_scalar(chars);
653        assert(pop_first_scalar(bytes) =~= encode_utf8(chars.drop_first()));
654        encode_utf8_valid_utf8(chars.drop_first());
655    }
656}
657
658/// Ensures that performing [`encode_utf8`] followed by [`decode_utf8`] results in the original `char` sequence.
659pub broadcast proof fn encode_utf8_decode_utf8(chars: Seq<char>)
660    ensures
661        #[trigger] decode_utf8(encode_utf8(chars)) == chars,
662    decreases chars.len(),
663{
664    broadcast use encode_utf8_valid_utf8;
665
666    if chars.len() == 0 {
667    } else {
668        let bytes = encode_utf8(chars);
669        encode_utf8_first_scalar(chars);
670        char_u32_cast(chars[0], decode_first_scalar(bytes));
671
672        assert(pop_first_scalar(bytes) =~= encode_utf8(chars.drop_first()));
673        let rest = chars.drop_first();
674        encode_utf8_decode_utf8(rest);
675    }
676}
677
678/// Properties of the first scalar from the result of [`decode_utf8`].
679pub proof fn decode_utf8_first_scalar(bytes: Seq<u8>)
680    requires
681        valid_utf8(bytes),
682        bytes.len() > 0,
683    ensures
684        encode_scalar((decode_first_scalar(bytes) as char) as u32) == take_first_scalar(bytes),
685{
686    if is_leading_byte_width_1(bytes[0]) {
687        decode_encode_width_1(bytes[0]);
688    } else if is_leading_byte_width_2(bytes[0]) {
689        decode_encode_width_2(bytes[0], bytes[1]);
690    } else if is_leading_byte_width_3(bytes[0]) {
691        decode_encode_width_3(bytes[0], bytes[1], bytes[2]);
692    } else {
693        decode_encode_width_4(bytes[0], bytes[1], bytes[2], bytes[3]);
694    }
695}
696
697/// Ensures that performing [`decode_utf8`] followed by [`encode_utf8`] results in the original byte sequence.
698pub broadcast proof fn decode_utf8_encode_utf8(bytes: Seq<u8>)
699    requires
700        valid_utf8(bytes),
701    ensures
702        #[trigger] encode_utf8(decode_utf8(bytes)) == bytes,
703    decreases bytes.len(),
704{
705    broadcast use encode_utf8_valid_utf8;
706
707    if bytes.len() == 0 {
708    } else {
709        let chars = decode_utf8(bytes);
710        let first = decode_first_scalar(bytes) as char;
711        let rest = pop_first_scalar(bytes);
712
713        char_is_scalar(first);
714        assert(encode_scalar(first as u32) == take_first_scalar(bytes)) by {
715            decode_utf8_first_scalar(bytes);
716        }
717
718        assert(chars.drop_first() =~= decode_utf8(rest));
719        decode_utf8_encode_utf8(rest);
720    }
721}
722
723/* Partial UTF-8 sequences */
724
725/// True when the first `i` bytes in the given sequence represent a valid UTF-8 encoding.
726pub open spec fn partial_valid_utf8(bytes: Seq<u8>, i: int) -> bool {
727    0 <= i <= bytes.len() && valid_utf8(bytes.subrange(0, i))
728}
729
730/// Ensures that a byte sequence is not a valid UTF-8 byte sequence when it has a suffix that is not a valid UTF-8 byte sequence.
731pub proof fn partial_valid_partial_invalid_utf8(bytes: Seq<u8>, i: int)
732    requires
733        0 <= i <= bytes.len(),
734        valid_utf8(bytes.subrange(0, i)),
735        !valid_utf8(bytes.subrange(i, bytes.len() as int)),
736    ensures
737        !valid_utf8(bytes),
738{
739    partial_valid_utf8_invalid_subrange_helper(bytes, i, 0);
740    assert(bytes.subrange(0, bytes.len() as int) =~= bytes);
741}
742
743proof fn partial_valid_utf8_invalid_subrange_helper(bytes: Seq<u8>, i: int, j: int)
744    requires
745        0 <= j <= i <= bytes.len(),
746        valid_utf8(bytes.subrange(0, i)),
747        !valid_utf8(bytes.subrange(i, bytes.len() as int)),
748        valid_utf8(bytes.subrange(0, j)),
749        valid_utf8(bytes.subrange(j, i)),
750    ensures
751        !valid_utf8(bytes.subrange(j, bytes.len() as int)),
752    decreases (bytes.len() - j),
753{
754    if j == i {
755    } else {
756        let bytes_j = bytes.subrange(j, bytes.len() as int);
757        if valid_first_scalar(bytes_j) {
758            partial_valid_utf8_extend(bytes, j);
759            let k = length_of_first_scalar(bytes_j);
760
761            assert(pop_first_scalar(bytes.subrange(j, i)) == bytes.subrange(j + k, i));
762
763            partial_valid_utf8_invalid_subrange_helper(bytes, i, j + k);
764
765            assert(bytes_j.subrange(k, bytes_j.len() as int) == bytes.subrange(
766                j + k,
767                bytes.len() as int,
768            ));
769        }
770    }
771}
772
773/// Ensures that concatenating two valid UTF-8 byte sequence results in a valid UTF-8 byte sequence.
774pub broadcast proof fn valid_utf8_concat(b1: Seq<u8>, b2: Seq<u8>)
775    requires
776        #[trigger] valid_utf8(b1),
777        #[trigger] valid_utf8(b2),
778    ensures
779        #[trigger] valid_utf8(b1 + b2),
780    decreases b1.len(),
781{
782    if b1.len() == 0 {
783        assert(b1 + b2 == b2) by { Seq::add_empty_left(b1, b2) };
784        assert(valid_utf8(b1 + b2));
785    } else {
786        let rest = pop_first_scalar(b1);
787        assert(pop_first_scalar(b1).len() < b1.len()) by { lemma_pop_first_scalar_decreases(b1) };
788        valid_utf8_concat(rest, b2);
789        assert(pop_first_scalar(b1 + b2) =~= rest + b2);
790        assert(valid_utf8(b1 + b2));
791    }
792}
793
794/// Ensures that if the prefix of a byte sequence is valid UTF-8, and remainder of the sequence begins with a valid UTF-8 encoding of a single scalar,
795/// then the prefix extended by that scalar encoding is also valid UTF-8.
796pub broadcast proof fn partial_valid_utf8_extend(bytes: Seq<u8>, i: int)
797    requires
798        #[trigger] partial_valid_utf8(bytes, i),
799        #[trigger] valid_first_scalar(bytes.subrange(i, bytes.len() as int)),
800    ensures
801        #[trigger] partial_valid_utf8(
802            bytes,
803            i + length_of_first_scalar(bytes.subrange(i, bytes.len() as int)),
804        ),
805{
806    reveal_with_fuel(valid_utf8, 2);
807    let scalar = bytes.subrange(
808        i,
809        i + length_of_first_scalar(bytes.subrange(i, bytes.len() as int)),
810    );
811    valid_utf8_concat(bytes.subrange(0, i), scalar);
812    assert(bytes.subrange(0, i) + scalar =~= bytes.subrange(
813        0,
814        i + length_of_first_scalar(bytes.subrange(i, bytes.len() as int)),
815    ));
816}
817
818/// Ensures that if the prefix of a byte sequence is valid UTF-8, and remainder of the sequence begins with a subsequence of valid UTF-8 encodings for 1-byte scalars (i.e. ASCII characters),
819/// then the prefix extended by that subsequence is also valid UTF-8.
820pub broadcast proof fn partial_valid_utf8_extend_ascii_block(bytes: Seq<u8>, start: int, end: int)
821    requires
822        forall|i: int|
823            0 <= start <= i < end <= bytes.len() ==> #[trigger] is_leading_byte_width_1(bytes[i]),
824        partial_valid_utf8(bytes, start),
825        0 <= start <= end <= bytes.len(),
826    ensures
827        #![trigger partial_valid_utf8(bytes, start), partial_valid_utf8(bytes, end)]
828        partial_valid_utf8(bytes, end),
829    decreases end - start,
830{
831    if end == start {
832    } else {
833        partial_valid_utf8_extend_ascii_block(bytes, start, end - 1);
834
835        let b = bytes[end - 1];
836        assert(is_leading_byte_width_1(b));
837        partial_valid_utf8_extend(bytes, end - 1);
838    }
839}
840
841/* Reasoning about character boundaries */
842
843/// True when the given index into the byte sequence is the first byte of a character's encoding or the end of the sequence, assuming that the sequence is valid UTF-8.
844pub open spec fn is_char_boundary(bytes: Seq<u8>, index: int) -> bool
845    recommends
846        valid_utf8(bytes),
847    decreases bytes.len(),
848    when valid_utf8(bytes)
849{
850    if index == 0 {
851        true
852    } else if index < 0 || bytes.len() < index {
853        false
854    } else {
855        is_char_boundary(pop_first_scalar(bytes), index - length_of_first_scalar(bytes))
856    }
857}
858
859proof fn take_first_scalar_valid_utf8(bytes: Seq<u8>)
860    requires
861        valid_utf8(bytes),
862    ensures
863        bytes.len() > 0 ==> valid_utf8(take_first_scalar(bytes)),
864{
865    reveal_with_fuel(valid_utf8, 2);
866}
867
868/// Ensures that the two subsequences formed by splitting a valid UTF-8 byte sequence at a character boundary are also valid UTF-8 byte sequences.
869pub broadcast proof fn valid_utf8_split(bytes: Seq<u8>, index: int)
870    requires
871        valid_utf8(bytes),
872        is_char_boundary(bytes, index),
873    ensures
874        #![trigger valid_utf8(bytes.subrange(0, index)), is_char_boundary(bytes, index)]
875        #![trigger valid_utf8(bytes.subrange(index, bytes.len() as int)), is_char_boundary(bytes, index)]
876        valid_utf8(bytes.subrange(0, index)),
877        valid_utf8(bytes.subrange(index, bytes.len() as int)),
878    decreases bytes.len(),
879{
880    if index == 0 {
881        assert(bytes =~= bytes.subrange(index, bytes.len() as int));
882    } else {
883        broadcast use axiom_seq_subrange_len;
884
885        let s1 = bytes.subrange(0, index);
886        let s2 = bytes.subrange(index, bytes.len() as int);
887        let head = take_first_scalar(bytes);
888        let tail = pop_first_scalar(bytes);
889        let new_offset = index - length_of_first_scalar(bytes);
890        // recursive call: show valid on split for tail
891        valid_utf8_split(tail, new_offset);
892        let n1 = tail.subrange(0, new_offset);
893        let n2 = tail.subrange(new_offset, tail.len() as int);
894        // now we need to concatenate the head back on
895        assert(s1 =~= head + n1) by {
896            assert(s1.len() == head.len() + n1.len()) by {
897                // to use subrange len axiom, we need to show that new_offset is in bounds for tail
898                is_char_boundary_len_first_scalar(bytes, index);
899            }
900        }
901        assert(valid_utf8(head + n1)) by {
902            take_first_scalar_valid_utf8(bytes);
903            valid_utf8_concat(head, n1);
904        }
905        assert(s2 =~= n2);
906    }
907}
908
909/// Ensures that a valid UTF-8 byte sequence can be decoded by separately decoding the two subsequences formed by splitting the original sequence at a character boundary.
910pub broadcast proof fn decode_utf8_split(bytes: Seq<u8>, index: int)
911    requires
912        valid_utf8(bytes),
913        is_char_boundary(bytes, index),
914    ensures
915        #![trigger decode_utf8(bytes.subrange(0, index)), is_char_boundary(bytes, index)]
916        #![trigger decode_utf8(bytes.subrange(index, bytes.len() as int)), is_char_boundary(bytes, index)]
917        decode_utf8(bytes) =~= decode_utf8(bytes.subrange(0, index)) + decode_utf8(
918            bytes.subrange(index, bytes.len() as int),
919        ),
920    decreases index,
921{
922    if index == 0 {
923        assert(bytes.subrange(index, bytes.len() as int) =~= bytes);
924    } else {
925        let first = bytes.subrange(0, index);
926        let second = bytes.subrange(index, bytes.len() as int);
927        is_char_boundary_len_first_scalar(bytes, index);
928        valid_utf8_split(bytes, index);
929        let bytes_tail = pop_first_scalar(bytes);
930        let first_tail = pop_first_scalar(first);
931        let bytes_head = decode_first_scalar(bytes) as char;
932        let first_head = decode_first_scalar(first) as char;
933        let new_index = (index - length_of_first_scalar(bytes)) as int;
934        decode_utf8_split(bytes_tail, new_index);
935        assert(second =~= bytes_tail.subrange(new_index, bytes_tail.len() as int));
936        assert(first_tail =~= bytes_tail.subrange(0, new_index));
937    }
938}
939
940proof fn is_char_boundary_len_first_scalar(bytes: Seq<u8>, index: int)
941    requires
942        valid_utf8(bytes),
943        is_char_boundary(bytes, index),
944    ensures
945        index > 0 ==> index >= length_of_first_scalar(bytes),
946{
947    reveal_with_fuel(is_char_boundary, 2);
948}
949
950/// Ensures that the start and end of a valid UTF-8 byte sequence are character boundaries.
951pub broadcast proof fn is_char_boundary_start_end_of_seq(bytes: Seq<u8>)
952    requires
953        valid_utf8(bytes),
954    ensures
955        #![trigger is_char_boundary(bytes, 0)]
956        #![trigger is_char_boundary(bytes, bytes.len() as int)]
957        is_char_boundary(bytes, 0),
958        is_char_boundary(bytes, bytes.len() as int),
959    decreases bytes.len(),
960{
961    if bytes.len() == 0 {
962    } else {
963        is_char_boundary_start_end_of_seq(pop_first_scalar(bytes));
964    }
965}
966
967/// Ensures that any byte in a valid UTF-8 byte sequence falls on a character boundary (i.e. the first byte in a codepoint's encoding) if and only if it does not have the form of a UTF-8 continuation byte.
968pub broadcast proof fn is_char_boundary_iff_not_is_continuation_byte(bytes: Seq<u8>, index: int)
969    requires
970        valid_utf8(bytes),
971        0 <= index < bytes.len(),
972    ensures
973        #[trigger] is_char_boundary(bytes, index) <==> !(#[trigger] is_continuation_byte(
974            bytes[index],
975        )),
976    decreases bytes.len(),
977{
978    if 0 <= index < length_of_first_scalar(bytes) {
979        reveal_with_fuel(is_char_boundary, 2);
980    } else {
981        is_char_boundary_iff_not_is_continuation_byte(
982            pop_first_scalar(bytes),
983            index - length_of_first_scalar(bytes),
984        );
985    }
986}
987
988/// Ensures that any byte in a valid UTF-8 byte sequence falls on a character boundary (i.e. the first byte in a codepoint's encoding) if and only if it has the form of a UTF-8 leading byte.
989pub broadcast proof fn is_char_boundary_iff_is_leading_byte(bytes: Seq<u8>, index: int)
990    requires
991        valid_utf8(bytes),
992        0 <= index < bytes.len(),
993    ensures
994        #![trigger is_char_boundary(bytes, index), is_leading_byte_width_1(bytes[index])]
995        #![trigger is_char_boundary(bytes, index), is_leading_byte_width_2(bytes[index])]
996        #![trigger is_char_boundary(bytes, index), is_leading_byte_width_3(bytes[index])]
997        #![trigger is_char_boundary(bytes, index), is_leading_byte_width_4(bytes[index])]
998        is_char_boundary(bytes, index) <==> (is_leading_byte_width_1(bytes[index])
999            || is_leading_byte_width_2(bytes[index]) || is_leading_byte_width_3(bytes[index])
1000            || is_leading_byte_width_4(bytes[index])),
1001    decreases bytes.len(),
1002{
1003    if 0 <= index < length_of_first_scalar(bytes) {
1004        reveal_with_fuel(is_char_boundary, 2);
1005    } else {
1006        is_char_boundary_iff_is_leading_byte(
1007            pop_first_scalar(bytes),
1008            index - length_of_first_scalar(bytes),
1009        );
1010    }
1011}
1012
1013pub broadcast proof fn valid_utf8_last(s: Seq<u8>)
1014    requires
1015        valid_utf8(s),
1016        s.len() > 0,
1017    ensures
1018        #![trigger is_continuation_byte(s.last())]
1019        #![trigger is_leading_byte_width_1(s.last())]
1020        !is_continuation_byte(s.last()) ==> is_leading_byte_width_1(s.last()),
1021    decreases s.len(),
1022{
1023    // this proof must be discharged recursively, since valid_utf8 only tells you information
1024    // about the first codepoint and recurses from there
1025    let first = decode_first_scalar(s);
1026    let rest = pop_first_scalar(s);
1027
1028    if rest.len() == 0 {
1029        if s.len() > 1 {
1030            assert(is_continuation_byte(s[s.len() - 1]));
1031        }
1032    } else {
1033        valid_utf8_last(rest);
1034    }
1035}
1036
1037/* Bit-level reasoning */
1038
1039/// Formulates the byte ranges for each type of byte in UTF-8 (leading and continuation) in terms of bitwise operators instead of ranges.
1040pub broadcast proof fn utf8_byte_ranges_bitwise(b: u8)
1041    by (bit_vector)
1042    ensures
1043        #![trigger b & 0x80]
1044        #![trigger b & 0xf0]
1045        #![trigger b & 0xf8]
1046        #![trigger b & 0xe0]
1047        #![trigger b & 0xc0]
1048        0x00 <= b <= 0x7f <==> b & 0x80 == 0,
1049        0xc0 <= b <= 0xdf <==> b & 0xe0 == 0xc0,
1050        0xe0 <= b <= 0xef <==> b & 0xf0 == 0xe0,
1051        0xf0 <= b <= 0xf7 <==> b & 0xf8 == 0xf0,
1052        0x80 <= b <= 0xbf <==> b & 0xc0 == 0x80,
1053{
1054}
1055
1056/* ASCII */
1057
1058/// True when the given character sequence only contains ASCII characters.
1059pub open spec fn is_ascii_chars(chars: Seq<char>) -> bool {
1060    forall|i| 0 <= i < chars.len() ==> '\0' <= #[trigger] chars[i] <= '\u{7f}'
1061}
1062
1063/// Ensures that the UTF-8 encoding for an ASCII character sequence has the same length of the original sequence and corresponds byte-by-byte to the characters in the original sequence.
1064pub broadcast proof fn is_ascii_chars_encode_utf8(chars: Seq<char>)
1065    requires
1066        #[trigger] is_ascii_chars(chars),
1067    ensures
1068        chars.len() == encode_utf8(chars).len(),
1069        forall|i|
1070            #![trigger chars[i]]
1071            #![trigger encode_utf8(chars)[i]]
1072            0 <= i < chars.len() ==> chars[i] as u8 == encode_utf8(chars)[i],
1073    decreases chars.len(),
1074{
1075    if chars.len() == 0 {
1076    } else {
1077        let c0 = chars[0] as u32;
1078        assert(c0 as u8 == leading_byte_width_1(c0)) by (bit_vector)
1079            requires
1080                has_width_1_encoding(c0),
1081        ;
1082        is_ascii_chars_encode_utf8(chars.drop_first());
1083    }
1084}
1085
1086/// Ensures that all characters in an ASCII character sequence have a numerical representation that falls in the range 0 (inclusive) to 128 (exclusive).
1087pub broadcast proof fn is_ascii_chars_nat_bound(chars: Seq<char>)
1088    ensures
1089        #[trigger] is_ascii_chars(chars) ==> forall|i: int|
1090            0 <= i < chars.len() ==> (chars.index(i) as nat) < 128,
1091{
1092}
1093
1094/// Ensures that an ASCII character sequence is formed by the concatenation of two ASCII character sequences.
1095pub broadcast proof fn is_ascii_chars_concat(c1: Seq<char>, c2: Seq<char>, c3: Seq<char>)
1096    requires
1097        c1 =~= c2 + c3,
1098    ensures
1099        #![trigger c2 + c3, is_ascii_chars(c1), is_ascii_chars(c2), is_ascii_chars(c3)]
1100        is_ascii_chars(c1) <==> is_ascii_chars(c2) && is_ascii_chars(c3),
1101{
1102    if (is_ascii_chars(c1)) {
1103        assert(c2 =~= c1.subrange(0, c2.len() as int));
1104        assert(c3 =~= c1.subrange(c2.len() as int, c1.len() as int));
1105    }
1106}
1107
1108pub broadcast group group_utf8_lib {
1109    encode_utf8_valid_utf8,
1110    encode_utf8_decode_utf8,
1111    decode_utf8_encode_utf8,
1112    char_is_scalar,
1113    char_u32_cast,
1114    valid_utf8_concat,
1115    partial_valid_utf8_extend,
1116    partial_valid_utf8_extend_ascii_block,
1117    valid_utf8_split,
1118    decode_utf8_split,
1119    is_char_boundary_start_end_of_seq,
1120    is_char_boundary_iff_not_is_continuation_byte,
1121    is_char_boundary_iff_is_leading_byte,
1122    valid_utf8_last,
1123    utf8_byte_ranges_bitwise,
1124    is_ascii_chars_encode_utf8,
1125    is_ascii_chars_nat_bound,
1126    is_ascii_chars_concat,
1127}
1128
1129} // verus!