/* ccan/charset/test/run-utf8-read-write.c */

#include <ccan/charset/charset.c>
#include <ccan/tap/tap.h>

#include <string.h>

#include "common.h"

/*
 * Testing procedure for utf8_read_char and utf8_write_char:
 *
 *  * Generate N valid and invalid Unicode code points.
 *  * Encode them with utf8_write_char.
 *  * Copy the resulting string into a buffer sized exactly as big as
 *    the string produced.  This way, Valgrind can catch buffer overflows
 *    by utf8_validate and utf8_read_char.
 *  * Validate the string with utf8_validate.
 *  * Decode the string, ensuring that:
 *    - Valid codepoints are read back.
 *    - Invalid characters are read back, but replaced
 *      with REPLACEMENT_CHARACTER.
 *    - No extra characters are read back.
 */

/* Number of randomized trials; main() plans and reports one TAP result per trial. */
#define TRIAL_COUNT             1000
/* Upper bound on the code points generated per trial; also sizes the per-trial buffers. */
#define MAX_CHARS_PER_TRIAL     100

/*
 * Reduce r into the inclusive range [lo, hi] by taking it modulo the span.
 * lo is expanded twice, so arguments should be free of side effects.
 * NOTE(review): assumes hi >= lo and that (hi)-(lo)+1 is non-zero; the
 * result is only as uniform as modulo reduction of r allows.
 */
#define range(r, lo, hi)  ((r) % ((hi)-(lo)+1) + (lo))

/*
 * Run TRIAL_COUNT randomized round-trip trials of utf8_write_char and
 * utf8_read_char.
 *
 * Each trial encodes between 1 and MAX_CHARS_PER_TRIAL random code points
 * (valid and invalid), copies the encoded bytes into an exactly-sized heap
 * buffer, validates them with utf8_validate, and decodes them again.
 * Every trial reports exactly one TAP result (pass or fail), matching the
 * plan of TRIAL_COUNT tests.
 *
 * Returns the value of exit_status().
 */
int main(void)
{
	int trial;
	
	plan_tests(TRIAL_COUNT);
	
	for (trial = 1; trial <= TRIAL_COUNT; trial++) {
		int i, count;
		/* What the decoder is expected to yield for each position. */
		uchar_t codepoints[MAX_CHARS_PER_TRIAL];
		uchar_t c;
		bool c_valid;
		
		/* 4 bytes per character: the largest length accepted below. */
		char write_buffer[MAX_CHARS_PER_TRIAL * 4];
		char *o = write_buffer;
		char *oe = write_buffer + sizeof(write_buffer);
		
		size_t size;
		char *string;
		const char *s;
		const char *e;
		
		int len;
		
		count = rand32() % MAX_CHARS_PER_TRIAL + 1;
		
		/* Encode phase: pick a code point class at random, then write it. */
		for (i = 0; i < count; i++) {
			if (o >= oe) {
				fail("utf8_write_char: Buffer overflow (1)");
				goto next_trial;
			}
			
			switch (rand32() % 7) {
				case 0:
					c = range(rand32(), 0x0, 0x7F);
					c_valid = true;
					break;
				case 1:
					c = range(rand32(), 0x80, 0x7FF);
					c_valid = true;
					break;
				case 2:
					c = range(rand32(), 0x800, 0xD7FF);
					c_valid = true;
					break;
				case 3:
					/* Surrogate range: treated as invalid by this test. */
					c = range(rand32(), 0xD800, 0xDFFF);
					c_valid = false;
					break;
				case 4:
					c = range(rand32(), 0xE000, 0xFFFF);
					c_valid = true;
					break;
				case 5:
					c = range(rand32(), 0x10000, 0x10FFFF);
					c_valid = true;
					break;
				default:
					/* Anything at or above 0x110000: treated as invalid. */
					do {
						c = rand32();
					} while (c < 0x110000);
					c_valid = false;
					break;
			}
			
			/* Invalid input is expected to decode as REPLACEMENT_CHARACTER. */
			codepoints[i] = c_valid ? c : REPLACEMENT_CHARACTER;
			
			len = utf8_write_char(c, o);
			if (len < 1 || len > 4) {
				fail("utf8_write_char: Return value is not 1 thru 4.");
				goto next_trial;
			}
			o += len;
		}
		if (o > oe) {
			fail("utf8_write_char: Buffer overflow (2)");
			goto next_trial;
		}
		
		/*
		 * Copy into a heap buffer of exactly the encoded size so that a
		 * memory checker can flag any read past the end of the string.
		 */
		size = (size_t)(o - write_buffer);
		string = malloc(size);
		if (string == NULL) {
			/* Report the trial as failed rather than passing NULL to memcpy. */
			fail("malloc failed while copying encoded string.");
			goto next_trial;
		}
		memcpy(string, write_buffer, size);
		s = string;
		e = string + size;
		
		if (!utf8_validate(s, e - s)) {
			fail("Invalid string produced by utf8_write_char.");
			goto next_trial_free_string;
		}
		
		/* Decode phase: every character must match what was recorded. */
		for (i = 0; i < count; i++) {
			if (s >= e) {
				fail("utf8_read_char: Buffer overflow (1)");
				goto next_trial_free_string;
			}
			
			len = utf8_read_char(s, &c);
			if (len < 1 || len > 4) {
				fail("utf8_read_char: Return value is not 1 thru 4.");
				goto next_trial_free_string;
			}
			if (c != codepoints[i]) {
				fail("utf8_read_char: Character read differs from that written.");
				goto next_trial_free_string;
			}
			s += len;
		}
		if (s > e) {
			fail("utf8_read_char: Buffer overflow (2)");
			goto next_trial_free_string;
		}
		if (s < e) {
			/* Leftover bytes mean extra characters were encoded. */
			fail("utf8_read_char: Did not reach end of string.");
			goto next_trial_free_string;
		}
		
		pass("Trial %d: %d characters", trial, count);
		
	next_trial_free_string:
		free(string);
	next_trial:;
	}
	
	return exit_status();
}