1use super::prelude::*;
29use super::seq::*;
30
31verus! {
32
33broadcast use super::seq::group_seq_axioms;
34pub open spec fn is_leading_byte_width_1(byte: u8) -> bool {
39 0x00 <= byte <= 0x7f
40}
41
42pub open spec fn is_leading_byte_width_2(byte: u8) -> bool {
45 0xc0 <= byte <= 0xdf
46}
47
48pub open spec fn is_leading_byte_width_3(byte: u8) -> bool {
51 0xe0 <= byte <= 0xef
52}
53
54pub open spec fn is_leading_byte_width_4(byte: u8) -> bool {
57 0xf0 <= byte <= 0xf7
58}
59
60pub open spec fn is_continuation_byte(byte: u8) -> bool {
63 0x80 <= byte <= 0xbf
64}
65
66pub open spec fn continuation_bits(byte: u8) -> u32
68 recommends
69 is_continuation_byte(byte),
70{
71 (byte & 0x3f) as u32
73}
74
75pub open spec fn leading_bits_width_1(byte: u8) -> u32
77 recommends
78 is_leading_byte_width_1(byte),
79{
80 (byte & 0x7F) as u32
82}
83
84pub open spec fn leading_bits_width_2(byte: u8) -> u32
86 recommends
87 is_leading_byte_width_2(byte),
88{
89 (byte & 0x1F) as u32
91}
92
93pub open spec fn leading_bits_width_3(byte: u8) -> u32
95 recommends
96 is_leading_byte_width_3(byte),
97{
98 (byte & 0x0F) as u32
100}
101
102pub open spec fn leading_bits_width_4(byte: u8) -> u32
104 recommends
105 is_leading_byte_width_4(byte),
106{
107 (byte & 0x07) as u32
109}
110
111pub open spec fn codepoint_width_1(byte1: u8) -> u32
113 recommends
114 is_leading_byte_width_1(byte1),
115{
116 leading_bits_width_1(byte1)
117}
118
119pub open spec fn codepoint_width_2(byte1: u8, byte2: u8) -> u32
126 recommends
127 is_leading_byte_width_2(byte1),
128 is_continuation_byte(byte2),
129{
130 (leading_bits_width_2(byte1) << 6) | continuation_bits(byte2)
131}
132
133pub open spec fn codepoint_width_3(byte1: u8, byte2: u8, byte3: u8) -> u32
135 recommends
136 is_leading_byte_width_3(byte1),
137 is_continuation_byte(byte2),
138 is_continuation_byte(byte3),
139{
140 (leading_bits_width_3(byte1) << 12) | (continuation_bits(byte2) << 6) | continuation_bits(byte3)
141}
142
143pub open spec fn codepoint_width_4(byte1: u8, byte2: u8, byte3: u8, byte4: u8) -> u32
151 recommends
152 is_leading_byte_width_4(byte1),
153 is_continuation_byte(byte2),
154 is_continuation_byte(byte3),
155 is_continuation_byte(byte4),
156{
157 (leading_bits_width_4(byte1) << 18) | (continuation_bits(byte2) << 12) | (continuation_bits(
158 byte3,
159 ) << 6) | continuation_bits(byte4)
160}
161
162pub open spec fn valid_leading_and_continuation_bytes_first_codepoint(bytes: Seq<u8>) -> bool {
164 ||| (bytes.len() >= 1 && is_leading_byte_width_1(bytes[0]))
165 ||| (bytes.len() >= 2 && is_leading_byte_width_2(bytes[0]) && is_continuation_byte(bytes[1]))
166 ||| (bytes.len() >= 3 && is_leading_byte_width_3(bytes[0]) && is_continuation_byte(bytes[1])
167 && is_continuation_byte(bytes[2]))
168 ||| (bytes.len() >= 4 && is_leading_byte_width_4(bytes[0]) && is_continuation_byte(bytes[1])
169 && is_continuation_byte(bytes[2]) && is_continuation_byte(bytes[3]))
170}
171
172pub open spec fn decode_first_codepoint(bytes: Seq<u8>) -> u32
174 recommends
175 valid_leading_and_continuation_bytes_first_codepoint(bytes),
176{
177 if is_leading_byte_width_1(bytes[0]) {
178 codepoint_width_1(bytes[0])
179 } else if is_leading_byte_width_2(bytes[0]) {
180 codepoint_width_2(bytes[0], bytes[1])
181 } else if is_leading_byte_width_3(bytes[0]) {
182 codepoint_width_3(bytes[0], bytes[1], bytes[2])
183 } else {
184 codepoint_width_4(bytes[0], bytes[1], bytes[2], bytes[3])
185 }
186}
187
188pub open spec fn length_of_first_codepoint(bytes: Seq<u8>) -> int
190 recommends
191 valid_leading_and_continuation_bytes_first_codepoint(bytes),
192{
193 if is_leading_byte_width_1(bytes[0]) {
194 1
195 } else if is_leading_byte_width_2(bytes[0]) {
196 2
197 } else if is_leading_byte_width_3(bytes[0]) {
198 3
199 } else {
200 4
201 }
202}
203
204pub open spec fn not_overlong_encoding(codepoint: u32, len: int) -> bool {
207 &&& (len == 2 ==> 0x80 <= codepoint)
208 &&& (len == 3 ==> 0x800 <= codepoint)
209 &&& (len == 4 ==> 0x10000 <= codepoint <= 0x10ffff)
210}
211
212pub open spec fn not_surrogate(codepoint: u32) -> bool {
215 !(0xD800 <= codepoint <= 0xDFFF)
216}
217
218pub open spec fn valid_first_scalar(bytes: Seq<u8>) -> bool {
222 &&& valid_leading_and_continuation_bytes_first_codepoint(bytes)
223 &&& not_overlong_encoding(decode_first_codepoint(bytes), length_of_first_codepoint(bytes))
224 &&& not_surrogate(decode_first_codepoint(bytes))
225}
226
227pub open spec fn decode_first_scalar(bytes: Seq<u8>) -> u32
229 recommends
230 valid_first_scalar(bytes),
231{
232 decode_first_codepoint(bytes)
233}
234
235pub open spec fn length_of_first_scalar(bytes: Seq<u8>) -> int
237 recommends
238 valid_first_scalar(bytes),
239{
240 length_of_first_codepoint(bytes)
241}
242
243pub open spec fn pop_first_scalar(bytes: Seq<u8>) -> Seq<u8>
245 recommends
246 valid_first_scalar(bytes),
247{
248 bytes.subrange(length_of_first_scalar(bytes), bytes.len() as int)
249}
250
251proof fn lemma_pop_first_scalar_decreases(bytes: Seq<u8>)
252 requires
253 valid_first_scalar(bytes),
254 ensures
255 pop_first_scalar(bytes).len() < bytes.len(),
256{
257 assert(length_of_first_scalar(bytes) <= bytes.len() as int);
258 assert(pop_first_scalar(bytes).len() == bytes.len() as int - length_of_first_scalar(bytes)) by {
259 axiom_seq_subrange_len(bytes, length_of_first_scalar(bytes), bytes.len() as int)
260 };
261}
262
263pub open spec fn take_first_scalar(bytes: Seq<u8>) -> Seq<u8>
265 recommends
266 valid_first_scalar(bytes),
267{
268 bytes.subrange(0, length_of_first_scalar(bytes))
269}
270
271pub open spec fn valid_utf8(bytes: Seq<u8>) -> bool
273 decreases bytes.len(),
274{
275 bytes.len() != 0 ==> valid_first_scalar(bytes) && valid_utf8(pop_first_scalar(bytes))
276}
277
278pub open spec fn decode_utf8(bytes: Seq<u8>) -> Seq<char>
280 recommends
281 valid_utf8(bytes),
282 decreases bytes.len(),
283 when valid_utf8(bytes)
284{
285 if bytes.len() == 0 {
286 seq![]
287 } else {
288 seq![decode_first_scalar(bytes) as char] + decode_utf8(pop_first_scalar(bytes))
289 }
290}
291
292pub open spec fn length_of_last_scalar(bytes: Seq<u8>) -> int
294 recommends
295 valid_utf8(bytes),
296 bytes.len() > 0,
297{
298 let n = bytes.len() as int;
299 if !is_continuation_byte(bytes[n - 1]) {
300 1
301 } else if !is_continuation_byte(bytes[n - 2]) {
302 2
303 } else if !is_continuation_byte(bytes[n - 3]) {
304 3
305 } else {
306 4
307 }
308}
309
310pub open spec fn take_last_scalar(bytes: Seq<u8>) -> Seq<u8>
312 recommends
313 valid_utf8(bytes),
314 bytes.len() > 0,
315{
316 let len = length_of_last_scalar(bytes);
317 bytes.subrange(bytes.len() - len, bytes.len() as int)
318}
319
320pub open spec fn decode_last_scalar(bytes: Seq<u8>) -> u32
322 recommends
323 valid_utf8(bytes),
324 bytes.len() > 0,
325{
326 let n = bytes.len() as int;
327 if !is_continuation_byte(bytes[n - 1]) {
328 codepoint_width_1(bytes[n - 1])
329 } else if !is_continuation_byte(bytes[n - 2]) {
330 codepoint_width_2(bytes[n - 2], bytes[n - 1])
331 } else if !is_continuation_byte(bytes[n - 3]) {
332 codepoint_width_3(bytes[n - 3], bytes[n - 2], bytes[n - 1])
333 } else {
334 codepoint_width_4(bytes[n - 4], bytes[n - 3], bytes[n - 2], bytes[n - 1])
335 }
336}
337
338pub open spec fn has_width_1_encoding(v: u32) -> bool {
342 0 <= v <= 0x7F
343}
344
345pub open spec fn has_width_2_encoding(v: u32) -> bool {
347 0x80 <= v <= 0x7FF
348}
349
350pub open spec fn has_width_3_encoding(v: u32) -> bool {
352 0x800 <= v <= 0xFFFF && !(0xD800 <= v <= 0xDFFF)
353}
354
355pub open spec fn has_width_4_encoding(v: u32) -> bool {
357 0x10000 <= v <= 0x10FFFF
358}
359
360pub open spec fn is_scalar(v: u32) -> bool {
363 ||| has_width_1_encoding(v)
364 ||| has_width_2_encoding(v)
365 ||| has_width_3_encoding(v)
366 ||| has_width_4_encoding(v)
367}
368
369pub open spec fn leading_byte_width_1(scalar: u32) -> u8
371 recommends
372 has_width_1_encoding(scalar),
373{
374 (scalar & 0x7F) as u8
375}
376
377pub open spec fn leading_byte_width_2(scalar: u32) -> u8
379 recommends
380 has_width_2_encoding(scalar),
381{
382 0xC0 | ((scalar >> 6) & 0x1F) as u8
383}
384
385pub open spec fn leading_byte_width_3(scalar: u32) -> u8
387 recommends
388 has_width_3_encoding(scalar),
389{
390 0xE0 | ((scalar >> 12) & 0x0F) as u8
391}
392
393pub open spec fn leading_byte_width_4(scalar: u32) -> u8
395 recommends
396 has_width_4_encoding(scalar),
397{
398 0xF0 | ((scalar >> 18) & 0x7) as u8
399}
400
401pub open spec fn last_continuation_byte(scalar: u32) -> u8
403 recommends
404 has_width_2_encoding(scalar) || has_width_3_encoding(scalar) || has_width_4_encoding(
405 scalar,
406 ),
407{
408 0x80 | (scalar & 0x3F) as u8
409}
410
411pub open spec fn second_last_continuation_byte(scalar: u32) -> u8
413 recommends
414 has_width_3_encoding(scalar) || has_width_4_encoding(scalar),
415{
416 0x80 | ((scalar >> 6) & 0x3F) as u8
417}
418
419pub open spec fn third_last_continuation_byte(scalar: u32) -> u8
421 recommends
422 has_width_4_encoding(scalar),
423{
424 0x80 | ((scalar >> 12) & 0x3F) as u8
425}
426
427pub open spec fn encode_scalar(scalar: u32) -> Seq<u8>
429 recommends
430 is_scalar(scalar),
431{
432 if has_width_1_encoding(scalar) {
433 seq![leading_byte_width_1(scalar)]
434 } else if has_width_2_encoding(scalar) {
435 seq![leading_byte_width_2(scalar), last_continuation_byte(scalar)]
436 } else if has_width_3_encoding(scalar) {
437 seq![
438 leading_byte_width_3(scalar),
439 second_last_continuation_byte(scalar),
440 last_continuation_byte(scalar),
441 ]
442 } else {
443 seq![
444 leading_byte_width_4(scalar),
445 third_last_continuation_byte(scalar),
446 second_last_continuation_byte(scalar),
447 last_continuation_byte(scalar),
448 ]
449 }
450}
451
452pub open spec fn encode_utf8(chars: Seq<char>) -> Seq<u8>
454 decreases chars.len(),
455{
456 if chars.len() == 0 {
457 seq![]
458 } else {
459 encode_scalar(chars[0] as u32) + encode_utf8(chars.drop_first())
460 }
461}
462
463proof fn encode_decode_width_1(c: u32)
467 by (bit_vector)
468 requires
469 has_width_1_encoding(c),
470 ensures
471 ({
472 let b1 = leading_byte_width_1(c);
473 &&& is_leading_byte_width_1(b1)
474 &&& codepoint_width_1(b1) == c
475 }),
476{
477}
478
479proof fn decode_encode_width_1(b1: u8)
481 by (bit_vector)
482 requires
483 is_leading_byte_width_1(b1),
484 ensures
485 ({
486 let c = codepoint_width_1(b1);
487 &&& has_width_1_encoding(c)
488 &&& leading_byte_width_1(c) == b1
489 }),
490{
491}
492
493proof fn encode_decode_width_2(c: u32)
495 by (bit_vector)
496 requires
497 has_width_2_encoding(c),
498 ensures
499 ({
500 let b1 = leading_byte_width_2(c);
501 let b2 = last_continuation_byte(c);
502 &&& is_leading_byte_width_2(b1)
503 &&& is_continuation_byte(b2)
504 &&& codepoint_width_2(b1, b2) == c
505 }),
506{
507}
508
509proof fn decode_encode_width_2(b1: u8, b2: u8)
511 by (bit_vector)
512 requires
513 is_leading_byte_width_2(b1),
514 is_continuation_byte(b2),
515 not_overlong_encoding(codepoint_width_2(b1, b2), 2),
516 ensures
517 ({
518 let c = codepoint_width_2(b1, b2);
519 &&& has_width_2_encoding(c)
520 &&& leading_byte_width_2(c) == b1
521 &&& last_continuation_byte(c) == b2
522 }),
523{
524}
525
526proof fn encode_decode_width_3(c: u32)
528 by (bit_vector)
529 requires
530 has_width_3_encoding(c),
531 ensures
532 ({
533 let b1 = leading_byte_width_3(c);
534 let b2 = second_last_continuation_byte(c);
535 let b3 = last_continuation_byte(c);
536 &&& is_leading_byte_width_3(b1)
537 &&& is_continuation_byte(b2)
538 &&& is_continuation_byte(b3)
539 &&& codepoint_width_3(b1, b2, b3) == c
540 }),
541{
542}
543
544proof fn decode_encode_width_3(b1: u8, b2: u8, b3: u8)
546 by (bit_vector)
547 requires
548 is_leading_byte_width_3(b1),
549 is_continuation_byte(b2),
550 is_continuation_byte(b3),
551 not_overlong_encoding(codepoint_width_3(b1, b2, b3), 3),
552 not_surrogate(codepoint_width_3(b1, b2, b3)),
553 ensures
554 ({
555 let c = codepoint_width_3(b1, b2, b3);
556 &&& has_width_3_encoding(c)
557 &&& leading_byte_width_3(c) == b1
558 &&& second_last_continuation_byte(c) == b2
559 &&& last_continuation_byte(c) == b3
560 }),
561{
562}
563
564proof fn encode_decode_width_4(c: u32)
566 by (bit_vector)
567 requires
568 has_width_4_encoding(c),
569 ensures
570 ({
571 let b1 = leading_byte_width_4(c);
572 let b2 = third_last_continuation_byte(c);
573 let b3 = second_last_continuation_byte(c);
574 let b4 = last_continuation_byte(c);
575 &&& is_leading_byte_width_4(b1)
576 &&& is_continuation_byte(b2)
577 &&& is_continuation_byte(b3)
578 &&& is_continuation_byte(b4)
579 &&& codepoint_width_4(b1, b2, b3, b4) == c
580 }),
581{
582}
583
584proof fn decode_encode_width_4(b1: u8, b2: u8, b3: u8, b4: u8)
586 by (bit_vector)
587 requires
588 is_leading_byte_width_4(b1),
589 is_continuation_byte(b2),
590 is_continuation_byte(b3),
591 is_continuation_byte(b4),
592 not_overlong_encoding(codepoint_width_4(b1, b2, b3, b4), 4),
593 ensures
594 ({
595 let c = codepoint_width_4(b1, b2, b3, b4);
596 &&& has_width_4_encoding(c)
597 &&& leading_byte_width_4(c) == b1
598 &&& third_last_continuation_byte(c) == b2
599 &&& second_last_continuation_byte(c) == b3
600 &&& last_continuation_byte(c) == b4
601 }),
602{
603}
604
605pub broadcast proof fn char_is_scalar(c: char)
607 ensures
608 is_scalar(#[trigger] (c as u32)),
609{
610}
611
612pub broadcast proof fn char_u32_cast(c: char, u: u32)
614 requires
615 u == #[trigger] (c as u32),
616 ensures
617 #[trigger] (u as char) == c,
618{
619}
620
621pub proof fn encode_utf8_first_scalar(chars: Seq<char>)
623 requires
624 chars.len() > 0,
625 ensures
626 decode_first_scalar(encode_utf8(chars)) == chars[0] as u32,
627 length_of_first_scalar(encode_utf8(chars)) == encode_scalar(chars[0] as u32).len(),
628 valid_first_scalar(encode_utf8(chars)),
629{
630 char_is_scalar(chars[0]);
631 let s = chars[0] as u32;
632 if has_width_1_encoding(s) {
633 encode_decode_width_1(s);
634 } else if has_width_2_encoding(s) {
635 encode_decode_width_2(s);
636 } else if has_width_3_encoding(s) {
637 encode_decode_width_3(s);
638 } else {
639 encode_decode_width_4(s);
640 }
641}
642
643pub broadcast proof fn encode_utf8_valid_utf8(chars: Seq<char>)
645 ensures
646 valid_utf8(#[trigger] encode_utf8(chars)),
647 decreases chars.len(),
648{
649 if chars.len() == 0 {
650 } else {
651 let bytes = encode_utf8(chars);
652 encode_utf8_first_scalar(chars);
653 assert(pop_first_scalar(bytes) =~= encode_utf8(chars.drop_first()));
654 encode_utf8_valid_utf8(chars.drop_first());
655 }
656}
657
658pub broadcast proof fn encode_utf8_decode_utf8(chars: Seq<char>)
660 ensures
661 #[trigger] decode_utf8(encode_utf8(chars)) == chars,
662 decreases chars.len(),
663{
664 broadcast use encode_utf8_valid_utf8;
665
666 if chars.len() == 0 {
667 } else {
668 let bytes = encode_utf8(chars);
669 encode_utf8_first_scalar(chars);
670 char_u32_cast(chars[0], decode_first_scalar(bytes));
671
672 assert(pop_first_scalar(bytes) =~= encode_utf8(chars.drop_first()));
673 let rest = chars.drop_first();
674 encode_utf8_decode_utf8(rest);
675 }
676}
677
678pub proof fn decode_utf8_first_scalar(bytes: Seq<u8>)
680 requires
681 valid_utf8(bytes),
682 bytes.len() > 0,
683 ensures
684 encode_scalar((decode_first_scalar(bytes) as char) as u32) == take_first_scalar(bytes),
685{
686 if is_leading_byte_width_1(bytes[0]) {
687 decode_encode_width_1(bytes[0]);
688 } else if is_leading_byte_width_2(bytes[0]) {
689 decode_encode_width_2(bytes[0], bytes[1]);
690 } else if is_leading_byte_width_3(bytes[0]) {
691 decode_encode_width_3(bytes[0], bytes[1], bytes[2]);
692 } else {
693 decode_encode_width_4(bytes[0], bytes[1], bytes[2], bytes[3]);
694 }
695}
696
697pub broadcast proof fn decode_utf8_encode_utf8(bytes: Seq<u8>)
699 requires
700 valid_utf8(bytes),
701 ensures
702 #[trigger] encode_utf8(decode_utf8(bytes)) == bytes,
703 decreases bytes.len(),
704{
705 broadcast use encode_utf8_valid_utf8;
706
707 if bytes.len() == 0 {
708 } else {
709 let chars = decode_utf8(bytes);
710 let first = decode_first_scalar(bytes) as char;
711 let rest = pop_first_scalar(bytes);
712
713 char_is_scalar(first);
714 assert(encode_scalar(first as u32) == take_first_scalar(bytes)) by {
715 decode_utf8_first_scalar(bytes);
716 }
717
718 assert(chars.drop_first() =~= decode_utf8(rest));
719 decode_utf8_encode_utf8(rest);
720 }
721}
722
723pub open spec fn partial_valid_utf8(bytes: Seq<u8>, i: int) -> bool {
727 0 <= i <= bytes.len() && valid_utf8(bytes.subrange(0, i))
728}
729
730pub proof fn partial_valid_partial_invalid_utf8(bytes: Seq<u8>, i: int)
732 requires
733 0 <= i <= bytes.len(),
734 valid_utf8(bytes.subrange(0, i)),
735 !valid_utf8(bytes.subrange(i, bytes.len() as int)),
736 ensures
737 !valid_utf8(bytes),
738{
739 partial_valid_utf8_invalid_subrange_helper(bytes, i, 0);
740 assert(bytes.subrange(0, bytes.len() as int) =~= bytes);
741}
742
743proof fn partial_valid_utf8_invalid_subrange_helper(bytes: Seq<u8>, i: int, j: int)
744 requires
745 0 <= j <= i <= bytes.len(),
746 valid_utf8(bytes.subrange(0, i)),
747 !valid_utf8(bytes.subrange(i, bytes.len() as int)),
748 valid_utf8(bytes.subrange(0, j)),
749 valid_utf8(bytes.subrange(j, i)),
750 ensures
751 !valid_utf8(bytes.subrange(j, bytes.len() as int)),
752 decreases (bytes.len() - j),
753{
754 if j == i {
755 } else {
756 let bytes_j = bytes.subrange(j, bytes.len() as int);
757 if valid_first_scalar(bytes_j) {
758 partial_valid_utf8_extend(bytes, j);
759 let k = length_of_first_scalar(bytes_j);
760
761 assert(pop_first_scalar(bytes.subrange(j, i)) == bytes.subrange(j + k, i));
762
763 partial_valid_utf8_invalid_subrange_helper(bytes, i, j + k);
764
765 assert(bytes_j.subrange(k, bytes_j.len() as int) == bytes.subrange(
766 j + k,
767 bytes.len() as int,
768 ));
769 }
770 }
771}
772
773pub broadcast proof fn valid_utf8_concat(b1: Seq<u8>, b2: Seq<u8>)
775 requires
776 #[trigger] valid_utf8(b1),
777 #[trigger] valid_utf8(b2),
778 ensures
779 #[trigger] valid_utf8(b1 + b2),
780 decreases b1.len(),
781{
782 if b1.len() == 0 {
783 assert(b1 + b2 == b2) by { Seq::add_empty_left(b1, b2) };
784 assert(valid_utf8(b1 + b2));
785 } else {
786 let rest = pop_first_scalar(b1);
787 assert(pop_first_scalar(b1).len() < b1.len()) by { lemma_pop_first_scalar_decreases(b1) };
788 valid_utf8_concat(rest, b2);
789 assert(pop_first_scalar(b1 + b2) =~= rest + b2);
790 assert(valid_utf8(b1 + b2));
791 }
792}
793
794pub broadcast proof fn partial_valid_utf8_extend(bytes: Seq<u8>, i: int)
797 requires
798 #[trigger] partial_valid_utf8(bytes, i),
799 #[trigger] valid_first_scalar(bytes.subrange(i, bytes.len() as int)),
800 ensures
801 #[trigger] partial_valid_utf8(
802 bytes,
803 i + length_of_first_scalar(bytes.subrange(i, bytes.len() as int)),
804 ),
805{
806 reveal_with_fuel(valid_utf8, 2);
807 let scalar = bytes.subrange(
808 i,
809 i + length_of_first_scalar(bytes.subrange(i, bytes.len() as int)),
810 );
811 valid_utf8_concat(bytes.subrange(0, i), scalar);
812 assert(bytes.subrange(0, i) + scalar =~= bytes.subrange(
813 0,
814 i + length_of_first_scalar(bytes.subrange(i, bytes.len() as int)),
815 ));
816}
817
818pub broadcast proof fn partial_valid_utf8_extend_ascii_block(bytes: Seq<u8>, start: int, end: int)
821 requires
822 forall|i: int|
823 0 <= start <= i < end <= bytes.len() ==> #[trigger] is_leading_byte_width_1(bytes[i]),
824 partial_valid_utf8(bytes, start),
825 0 <= start <= end <= bytes.len(),
826 ensures
827 #![trigger partial_valid_utf8(bytes, start), partial_valid_utf8(bytes, end)]
828 partial_valid_utf8(bytes, end),
829 decreases end - start,
830{
831 if end == start {
832 } else {
833 partial_valid_utf8_extend_ascii_block(bytes, start, end - 1);
834
835 let b = bytes[end - 1];
836 assert(is_leading_byte_width_1(b));
837 partial_valid_utf8_extend(bytes, end - 1);
838 }
839}
840
841pub open spec fn is_char_boundary(bytes: Seq<u8>, index: int) -> bool
845 recommends
846 valid_utf8(bytes),
847 decreases bytes.len(),
848 when valid_utf8(bytes)
849{
850 if index == 0 {
851 true
852 } else if index < 0 || bytes.len() < index {
853 false
854 } else {
855 is_char_boundary(pop_first_scalar(bytes), index - length_of_first_scalar(bytes))
856 }
857}
858
859proof fn take_first_scalar_valid_utf8(bytes: Seq<u8>)
860 requires
861 valid_utf8(bytes),
862 ensures
863 bytes.len() > 0 ==> valid_utf8(take_first_scalar(bytes)),
864{
865 reveal_with_fuel(valid_utf8, 2);
866}
867
868pub broadcast proof fn valid_utf8_split(bytes: Seq<u8>, index: int)
870 requires
871 valid_utf8(bytes),
872 is_char_boundary(bytes, index),
873 ensures
874 #![trigger valid_utf8(bytes.subrange(0, index)), is_char_boundary(bytes, index)]
875 #![trigger valid_utf8(bytes.subrange(index, bytes.len() as int)), is_char_boundary(bytes, index)]
876 valid_utf8(bytes.subrange(0, index)),
877 valid_utf8(bytes.subrange(index, bytes.len() as int)),
878 decreases bytes.len(),
879{
880 if index == 0 {
881 assert(bytes =~= bytes.subrange(index, bytes.len() as int));
882 } else {
883 broadcast use axiom_seq_subrange_len;
884
885 let s1 = bytes.subrange(0, index);
886 let s2 = bytes.subrange(index, bytes.len() as int);
887 let head = take_first_scalar(bytes);
888 let tail = pop_first_scalar(bytes);
889 let new_offset = index - length_of_first_scalar(bytes);
890 valid_utf8_split(tail, new_offset);
892 let n1 = tail.subrange(0, new_offset);
893 let n2 = tail.subrange(new_offset, tail.len() as int);
894 assert(s1 =~= head + n1) by {
896 assert(s1.len() == head.len() + n1.len()) by {
897 is_char_boundary_len_first_scalar(bytes, index);
899 }
900 }
901 assert(valid_utf8(head + n1)) by {
902 take_first_scalar_valid_utf8(bytes);
903 valid_utf8_concat(head, n1);
904 }
905 assert(s2 =~= n2);
906 }
907}
908
909pub broadcast proof fn decode_utf8_split(bytes: Seq<u8>, index: int)
911 requires
912 valid_utf8(bytes),
913 is_char_boundary(bytes, index),
914 ensures
915 #![trigger decode_utf8(bytes.subrange(0, index)), is_char_boundary(bytes, index)]
916 #![trigger decode_utf8(bytes.subrange(index, bytes.len() as int)), is_char_boundary(bytes, index)]
917 decode_utf8(bytes) =~= decode_utf8(bytes.subrange(0, index)) + decode_utf8(
918 bytes.subrange(index, bytes.len() as int),
919 ),
920 decreases index,
921{
922 if index == 0 {
923 assert(bytes.subrange(index, bytes.len() as int) =~= bytes);
924 } else {
925 let first = bytes.subrange(0, index);
926 let second = bytes.subrange(index, bytes.len() as int);
927 is_char_boundary_len_first_scalar(bytes, index);
928 valid_utf8_split(bytes, index);
929 let bytes_tail = pop_first_scalar(bytes);
930 let first_tail = pop_first_scalar(first);
931 let bytes_head = decode_first_scalar(bytes) as char;
932 let first_head = decode_first_scalar(first) as char;
933 let new_index = (index - length_of_first_scalar(bytes)) as int;
934 decode_utf8_split(bytes_tail, new_index);
935 assert(second =~= bytes_tail.subrange(new_index, bytes_tail.len() as int));
936 assert(first_tail =~= bytes_tail.subrange(0, new_index));
937 }
938}
939
940proof fn is_char_boundary_len_first_scalar(bytes: Seq<u8>, index: int)
941 requires
942 valid_utf8(bytes),
943 is_char_boundary(bytes, index),
944 ensures
945 index > 0 ==> index >= length_of_first_scalar(bytes),
946{
947 reveal_with_fuel(is_char_boundary, 2);
948}
949
950pub broadcast proof fn is_char_boundary_start_end_of_seq(bytes: Seq<u8>)
952 requires
953 valid_utf8(bytes),
954 ensures
955 #![trigger is_char_boundary(bytes, 0)]
956 #![trigger is_char_boundary(bytes, bytes.len() as int)]
957 is_char_boundary(bytes, 0),
958 is_char_boundary(bytes, bytes.len() as int),
959 decreases bytes.len(),
960{
961 if bytes.len() == 0 {
962 } else {
963 is_char_boundary_start_end_of_seq(pop_first_scalar(bytes));
964 }
965}
966
967pub broadcast proof fn is_char_boundary_iff_not_is_continuation_byte(bytes: Seq<u8>, index: int)
969 requires
970 valid_utf8(bytes),
971 0 <= index < bytes.len(),
972 ensures
973 #[trigger] is_char_boundary(bytes, index) <==> !(#[trigger] is_continuation_byte(
974 bytes[index],
975 )),
976 decreases bytes.len(),
977{
978 if 0 <= index < length_of_first_scalar(bytes) {
979 reveal_with_fuel(is_char_boundary, 2);
980 } else {
981 is_char_boundary_iff_not_is_continuation_byte(
982 pop_first_scalar(bytes),
983 index - length_of_first_scalar(bytes),
984 );
985 }
986}
987
988pub broadcast proof fn is_char_boundary_iff_is_leading_byte(bytes: Seq<u8>, index: int)
990 requires
991 valid_utf8(bytes),
992 0 <= index < bytes.len(),
993 ensures
994 #![trigger is_char_boundary(bytes, index), is_leading_byte_width_1(bytes[index])]
995 #![trigger is_char_boundary(bytes, index), is_leading_byte_width_2(bytes[index])]
996 #![trigger is_char_boundary(bytes, index), is_leading_byte_width_3(bytes[index])]
997 #![trigger is_char_boundary(bytes, index), is_leading_byte_width_4(bytes[index])]
998 is_char_boundary(bytes, index) <==> (is_leading_byte_width_1(bytes[index])
999 || is_leading_byte_width_2(bytes[index]) || is_leading_byte_width_3(bytes[index])
1000 || is_leading_byte_width_4(bytes[index])),
1001 decreases bytes.len(),
1002{
1003 if 0 <= index < length_of_first_scalar(bytes) {
1004 reveal_with_fuel(is_char_boundary, 2);
1005 } else {
1006 is_char_boundary_iff_is_leading_byte(
1007 pop_first_scalar(bytes),
1008 index - length_of_first_scalar(bytes),
1009 );
1010 }
1011}
1012
1013pub broadcast proof fn valid_utf8_last(s: Seq<u8>)
1014 requires
1015 valid_utf8(s),
1016 s.len() > 0,
1017 ensures
1018 #![trigger is_continuation_byte(s.last())]
1019 #![trigger is_leading_byte_width_1(s.last())]
1020 !is_continuation_byte(s.last()) ==> is_leading_byte_width_1(s.last()),
1021 decreases s.len(),
1022{
1023 let first = decode_first_scalar(s);
1026 let rest = pop_first_scalar(s);
1027
1028 if rest.len() == 0 {
1029 if s.len() > 1 {
1030 assert(is_continuation_byte(s[s.len() - 1]));
1031 }
1032 } else {
1033 valid_utf8_last(rest);
1034 }
1035}
1036
1037pub broadcast proof fn utf8_byte_ranges_bitwise(b: u8)
1041 by (bit_vector)
1042 ensures
1043 #![trigger b & 0x80]
1044 #![trigger b & 0xf0]
1045 #![trigger b & 0xf8]
1046 #![trigger b & 0xe0]
1047 #![trigger b & 0xc0]
1048 0x00 <= b <= 0x7f <==> b & 0x80 == 0,
1049 0xc0 <= b <= 0xdf <==> b & 0xe0 == 0xc0,
1050 0xe0 <= b <= 0xef <==> b & 0xf0 == 0xe0,
1051 0xf0 <= b <= 0xf7 <==> b & 0xf8 == 0xf0,
1052 0x80 <= b <= 0xbf <==> b & 0xc0 == 0x80,
1053{
1054}
1055
1056pub open spec fn is_ascii_chars(chars: Seq<char>) -> bool {
1060 forall|i| 0 <= i < chars.len() ==> '\0' <= #[trigger] chars[i] <= '\u{7f}'
1061}
1062
1063pub broadcast proof fn is_ascii_chars_encode_utf8(chars: Seq<char>)
1065 requires
1066 #[trigger] is_ascii_chars(chars),
1067 ensures
1068 chars.len() == encode_utf8(chars).len(),
1069 forall|i|
1070 #![trigger chars[i]]
1071 #![trigger encode_utf8(chars)[i]]
1072 0 <= i < chars.len() ==> chars[i] as u8 == encode_utf8(chars)[i],
1073 decreases chars.len(),
1074{
1075 if chars.len() == 0 {
1076 } else {
1077 let c0 = chars[0] as u32;
1078 assert(c0 as u8 == leading_byte_width_1(c0)) by (bit_vector)
1079 requires
1080 has_width_1_encoding(c0),
1081 ;
1082 is_ascii_chars_encode_utf8(chars.drop_first());
1083 }
1084}
1085
1086pub broadcast proof fn is_ascii_chars_nat_bound(chars: Seq<char>)
1088 ensures
1089 #[trigger] is_ascii_chars(chars) ==> forall|i: int|
1090 0 <= i < chars.len() ==> (chars.index(i) as nat) < 128,
1091{
1092}
1093
1094pub broadcast proof fn is_ascii_chars_concat(c1: Seq<char>, c2: Seq<char>, c3: Seq<char>)
1096 requires
1097 c1 =~= c2 + c3,
1098 ensures
1099 #![trigger c2 + c3, is_ascii_chars(c1), is_ascii_chars(c2), is_ascii_chars(c3)]
1100 is_ascii_chars(c1) <==> is_ascii_chars(c2) && is_ascii_chars(c3),
1101{
1102 if (is_ascii_chars(c1)) {
1103 assert(c2 =~= c1.subrange(0, c2.len() as int));
1104 assert(c3 =~= c1.subrange(c2.len() as int, c1.len() as int));
1105 }
1106}
1107
1108pub broadcast group group_utf8_lib {
1109 encode_utf8_valid_utf8,
1110 encode_utf8_decode_utf8,
1111 decode_utf8_encode_utf8,
1112 char_is_scalar,
1113 char_u32_cast,
1114 valid_utf8_concat,
1115 partial_valid_utf8_extend,
1116 partial_valid_utf8_extend_ascii_block,
1117 valid_utf8_split,
1118 decode_utf8_split,
1119 is_char_boundary_start_end_of_seq,
1120 is_char_boundary_iff_not_is_continuation_byte,
1121 is_char_boundary_iff_is_leading_byte,
1122 valid_utf8_last,
1123 utf8_byte_ranges_bitwise,
1124 is_ascii_chars_encode_utf8,
1125 is_ascii_chars_nat_bound,
1126 is_ascii_chars_concat,
1127}
1128
1129}