#include #include #include #include #include #include #include "common.h" /* Make a valid or invalid Unicode character fitting in exactly @len UTF-8 bytes. */ static uchar_t utf8_randcode(int len, bool valid, bool after_clipped) { uint32_t r = rand32(); uchar_t ret; #define range(lo, hi) ((r & 0x7FFFFFFF) % ((hi)-(lo)+1) + (lo)) #define high_bit_set() (!!(r & 0x80000000)) switch (len) { case 1: if (valid) { /* Generate a character U+0000..U+007F */ return r & 0x7F; } else { /* * Generate a character U+0080..U+00BF or U+00F8..U+00FF. * * However, don't generate U+0080..U+00BF (10xxxxxx) after a * clipped character, as that can inadvertently form a valid, * complete character. */ if (!after_clipped && high_bit_set()) return range(0x80, 0xBF); else return range(0xF8, 0xFF); } case 2: if (valid) { /* Generate a character U+0080..U+07FF */ return range(0x80, 0x7FF); } else { /* Generate a character U+0000..U+007F */ return r & 0x7F; } case 3: if (valid) { /* Generate a character U+0800..U+FFFF, but not U+D800..U+DFFF */ for (;;) { ret = range(0x800, 0xFFFF); if (ret >= 0xD800 && ret <= 0xDFFF) { r = rand32(); continue; } else { break; } } return ret; } else { /* Generate a character U+0000..U+07FF or U+D800..U+DFFF */ if (high_bit_set()) return r & 0x7FF; else return 0xD800 + (r & 0x7FF); } case 4: if (valid) { /* Generate a character U+10000..U+10FFFF */ return range(0x10000, 0x10FFFF); } else { /* Generate a character U+0000..0xFFFF or U+110000..U+1FFFFF */ if (high_bit_set()) return r & 0xFFFF; else return range(0x110000, 0x1FFFFF); } default: assert(false); } #undef range #undef high_bit_set } /* Encode @uc as UTF-8 using exactly @len characters. @len should be 1 thru 4. */ static void utf8_encode_raw(char *out, unsigned int uc, int len) { switch (len) { case 1: assert(uc <= 0xC1 || (uc >= 0xF8 && uc <= 0xFF)); *out++ = uc; break; case 2: assert(uc <= 0x7FF); *out++ = 0xC0 | ((uc >> 6) & 0x1F); *out++ = 0x80 | (uc & 0x3F); break; case 3: assert(uc <= 0xFFFF); *out++ = 0xE0 | ((uc >> 12) & 0x0F); *out++ = 0x80 | ((uc >> 6) & 0x3F); *out++ = 0x80 | (uc & 0x3F); break; case 4: assert(uc <= 0x1FFFFF); *out++ = 0xF0 | ((uc >> 18) & 0x07); *out++ = 0x80 | ((uc >> 12) & 0x3F); *out++ = 0x80 | ((uc >> 6) & 0x3F); *out++ = 0x80 | (uc & 0x3F); break; } } #if COMPUTE_AVERAGE_LENGTH double total_averages; #endif /* Generate a UTF-8 string of the given byte length, randomly deciding if it should be valid or not. Return true if it's valid, false if it's not. */ static bool utf8_mktest(char *out, int len) { double pf; uint32_t pu; int n; bool valid = true; bool v; bool after_clipped = false; #if COMPUTE_AVERAGE_LENGTH int n_total = 0; int count = 0; #endif /* * Probability that, per character, it should be valid. * The goal is to make utf8_mktest as a whole * have a 50% chance of generating a valid string. * * The equation being solved is: * * p^n = 0.5 * * where p is the probability that each character is valid, * and n is the number of characters in the string. * * 2.384 is the approximate average length of each character, * so len/2.384 is about how many characters this string * is expected to contain. */ pf = pow(0.5, 2.384/len); /* Convert to uint32_t to test against rand32. */ pu = pf * 4294967295.0; for (;len > 0; len -= n, out += n) { v = rand32() <= pu; if (v) { /* Generate a valid character. */ n = rand32() % (len < 4 ? len : 4) + 1; utf8_encode_raw(out, utf8_randcode(n, true, after_clipped), n); after_clipped = false; } else if (rand32() % 5) { /* Generate an invalid character. */ n = rand32() % (len < 4 ? len : 4) + 1; utf8_encode_raw(out, utf8_randcode(n, false, after_clipped), n); after_clipped = false; } else { /* Generate a clipped but otherwise valid character. */ char tmp[4]; n = rand32() % 3 + 2; utf8_encode_raw(tmp, utf8_randcode(n, true, after_clipped), n); n -= rand32() % (n-1) + 1; if (n > len) n = len; assert(n >= 1 && n <= 3); memcpy(out, tmp, n); after_clipped = true; } if (!v) valid = false; #if COMPUTE_AVERAGE_LENGTH n_total += n; count++; #endif } #if COMPUTE_AVERAGE_LENGTH if (count > 0) total_averages += (double)n_total / count; #endif return valid; } static void test_utf8_validate(void) { char buffer[128]; int i; int len; bool valid; int passed=0, p_valid=0, p_invalid=0, total=0; int count; count = 100000; #if COMPUTE_AVERAGE_LENGTH total_averages = 0.0; #endif for (i=0; i count/10 && p_invalid > count/10, "Valid and invalid should be balanced"); #if COMPUTE_AVERAGE_LENGTH printf("Average character length: %f\n", total_averages / count); #endif } int main(void) { /* This is how many tests you plan to run */ plan_tests(2); test_utf8_validate(); /* This exits depending on whether all tests passed */ return exit_status(); }