libidn 1.42
punycode.c
Go to the documentation of this file.
1/* punycode.c --- Implementation of punycode used to ASCII encode IDN's.
2 Copyright (C) 2002-2024 Simon Josefsson
3
4 This file is part of GNU Libidn.
5
6 GNU Libidn is free software: you can redistribute it and/or
7 modify it under the terms of either:
8
9 * the GNU Lesser General Public License as published by the Free
10 Software Foundation; either version 3 of the License, or (at
11 your option) any later version.
12
13 or
14
15 * the GNU General Public License as published by the Free
16 Software Foundation; either version 2 of the License, or (at
17 your option) any later version.
18
19 or both in parallel, as here.
20
21 GNU Libidn is distributed in the hope that it will be useful,
22 but WITHOUT ANY WARRANTY; without even the implied warranty of
23 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
24 General Public License for more details.
25
26 You should have received copies of the GNU General Public License and
27 the GNU Lesser General Public License along with this program. If
28 not, see <https://www.gnu.org/licenses/>. */
29
30/*
31 * This file is derived from RFC 3492bis written by Adam M. Costello,
32 * downloaded from http://www.nicemice.net/idn/punycode-spec.gz on
33 * 2015-03-02 with SHA1 a966a8017f6be579d74a50a226accc7607c40133, a
34 * copy of which is stored in the GNU Libidn version controlled
35 * repository under doc/specification/punycode-spec.gz.
36 *
37 * The changes compared to Adam's file include: re-indentation, adding
38 * the license boilerplate and this comment, #include of config.h and
39 * punycode.h, adding GTK-DOC comments, changing the return code of
40 * punycode_encode and punycode_decode from enum to int, renaming the
41 * input_length_orig function input variable to input_length (and
42 * renaming the internal input_length variable to input_len) in
43 * punycode_encode.
44 *
45 * Adam's file contains the following:
46 *
47 * punycode-sample.c 2.0.0 (2004-Mar-21-Sun)
48 * http://www.nicemice.net/idn/
49 * Adam M. Costello
50 * http://www.nicemice.net/amc/
51 *
52 * This is ANSI C code (C89) implementing Punycode 1.0.x.
53 *
54 * Disclaimer and license: Regarding this entire document or any
55 * portion of it (including the pseudocode and C code), the author
56 * makes no guarantees and is not responsible for any damage resulting
57 * from its use. The author grants irrevocable permission to anyone
58 * to use, modify, and distribute it in any way that does not diminish
59 * the rights of anyone else to use, modify, and distribute it,
60 * provided that redistributed derivative works do not contain
61 * misleading author or version information. Derivative works need
62 * not be licensed under similar terms.
63 */
64
65#include <config.h>
66
67/**********************************************************/
68/* Implementation (would normally go in its own .c file): */
69
70#include <string.h>
71
72#include "punycode.h"
73
74/*** Bootstring parameters for Punycode ***/
75
76enum
77{ base = 36, tmin = 1, tmax = 26, skew = 38, damp = 700,
78 initial_bias = 72, initial_n = 0x80, delimiter = 0x2D
79};
80
81/* basic(cp) tests whether cp is a basic code point: */
82#define basic(cp) ((punycode_uint)(cp) < 0x80)
83
84/* delim(cp) tests whether cp is a delimiter: */
85#define delim(cp) ((cp) == delimiter)
86
87/* decode_digit(cp) returns the numeric value of a basic code */
88/* point (for use in representing integers) in the range 0 to */
89/* base-1, or base if cp does not represent a value. */
90
91static unsigned
92decode_digit (int cp)
93{
94 return (unsigned) (cp - 48 < 10 ? cp - 22 : cp - 65 < 26 ? cp - 65 :
95 cp - 97 < 26 ? cp - 97 : base);
96}
97
98/* encode_digit(d,flag) returns the basic code point whose value */
99/* (when used for representing integers) is d, which needs to be in */
100/* the range 0 to base-1. The lowercase form is used unless flag is */
101/* nonzero, in which case the uppercase form is used. The behavior */
102/* is undefined if flag is nonzero and digit d has no uppercase form. */
103
104static char
105encode_digit (punycode_uint d, int flag)
106{
107 return d + 22 + 75 * (d < 26) - ((flag != 0) << 5);
108 /* 0..25 map to ASCII a..z or A..Z */
109 /* 26..35 map to ASCII 0..9 */
110}
111
112/* flagged(bcp) tests whether a basic code point is flagged */
113/* (uppercase). The behavior is undefined if bcp is not a */
114/* basic code point. */
115
116#define flagged(bcp) ((punycode_uint)(bcp) - 65 < 26)
117
118/* encode_basic(bcp,flag) forces a basic code point to lowercase */
119/* if flag is zero, uppercase if flag is nonzero, and returns */
120/* the resulting code point. The code point is unchanged if it */
121/* is caseless. The behavior is undefined if bcp is not a basic */
122/* code point. */
123
124static char
125encode_basic (punycode_uint bcp, int flag)
126{
127 bcp -= (bcp - 97 < 26) << 5;
128 return bcp + ((!flag && (bcp - 65 < 26)) << 5);
129}
130
131/*** Platform-specific constants ***/
132
133/* maxint is the maximum value of a punycode_uint variable: */
134static const punycode_uint maxint = -1;
135/* Because maxint is unsigned, -1 becomes the maximum value. */
136
137/*** Bias adaptation function ***/
138
139static punycode_uint
140adapt (punycode_uint delta, punycode_uint numpoints, int firsttime)
141{
143
144 delta = firsttime ? delta / damp : delta >> 1;
145 /* delta >> 1 is a faster way of doing delta / 2 */
146 delta += delta / numpoints;
147
148 for (k = 0; delta > ((base - tmin) * tmax) / 2; k += base)
149 {
150 delta /= base - tmin;
151 }
152
153 return k + (base - tmin + 1) * delta / (delta + skew);
154}
155
156/*** Main encode function ***/
157
195int
196punycode_encode (size_t input_length,
197 const punycode_uint input[],
198 const unsigned char case_flags[],
199 size_t *output_length, char output[])
200{
201 punycode_uint input_len, n, delta, h, b, bias, j, m, q, k, t;
202 size_t out, max_out;
203
204 /* The Punycode spec assumes that the input length is the same type */
205 /* of integer as a code point, so we need to convert the size_t to */
206 /* a punycode_uint, which could overflow. */
207
208 if (input_length > maxint)
209 return punycode_overflow;
210 input_len = (punycode_uint) input_length;
211
212 /* Initialize the state: */
213
214 n = initial_n;
215 delta = 0;
216 out = 0;
217 max_out = *output_length;
218 bias = initial_bias;
219
220 /* Handle the basic code points: */
221
222 for (j = 0; j < input_len; ++j)
223 {
224 if (basic (input[j]))
225 {
226 if (max_out - out < 2)
227 return punycode_big_output;
228 output[out++] = case_flags ?
229 encode_basic (input[j], case_flags[j]) : (char) input[j];
230 }
231 else if (input[j] > 0x10FFFF
232 || (input[j] >= 0xD800 && input[j] <= 0xDBFF))
233 return punycode_bad_input;
234 /* else if (input[j] < n) return punycode_bad_input; */
235 /* (not needed for Punycode with unsigned code points) */
236 }
237
238 h = b = (punycode_uint) out;
239 /* cannot overflow because out <= input_len <= maxint */
240
241 /* h is the number of code points that have been handled, b is the */
242 /* number of basic code points, and out is the number of ASCII code */
243 /* points that have been output. */
244
245 if (b > 0)
246 output[out++] = delimiter;
247
248 /* Main encoding loop: */
249
250 while (h < input_len)
251 {
252 /* All non-basic code points < n have been */
253 /* handled already. Find the next larger one: */
254
255 for (m = maxint, j = 0; j < input_len; ++j)
256 {
257 /* if (basic(input[j])) continue; */
258 /* (not needed for Punycode) */
259 if (input[j] >= n && input[j] < m)
260 m = input[j];
261 }
262
263 /* Increase delta enough to advance the decoder's */
264 /* <n,i> state to <m,0>, but guard against overflow: */
265
266 if (m - n > (maxint - delta) / (h + 1))
267 return punycode_overflow;
268 delta += (m - n) * (h + 1);
269 n = m;
270
271 for (j = 0; j < input_len; ++j)
272 {
273 /* Punycode does not need to check whether input[j] is basic: */
274 if (input[j] < n /* || basic(input[j]) */ )
275 {
276 if (++delta == 0)
277 return punycode_overflow;
278 }
279
280 if (input[j] == n)
281 {
282 /* Represent delta as a generalized variable-length integer: */
283
284 for (q = delta, k = base;; k += base)
285 {
286 if (out >= max_out)
287 return punycode_big_output;
288 t = k <= bias /* + tmin */ ? tmin : /* +tmin not needed */
289 k >= bias + tmax ? tmax : k - bias;
290 if (q < t)
291 break;
292 output[out++] = encode_digit (t + (q - t) % (base - t), 0);
293 q = (q - t) / (base - t);
294 }
295
296 output[out++] = encode_digit (q, case_flags && case_flags[j]);
297 bias = adapt (delta, h + 1, h == b);
298 delta = 0;
299 ++h;
300 }
301 }
302
303 ++delta, ++n;
304 }
305
306 *output_length = out;
307 return punycode_success;
308}
309
310/*** Main decode function ***/
311
347int
348punycode_decode (size_t input_length,
349 const char input[],
350 size_t *output_length,
351 punycode_uint output[], unsigned char case_flags[])
352{
353 punycode_uint n, out, i, max_out, bias, oldi, w, k, digit, t;
354 size_t b, j, in;
355
356 /* Initialize the state: */
357
358 n = initial_n;
359 out = i = 0;
360 max_out = *output_length > maxint ? maxint
361 : (punycode_uint) * output_length;
362 bias = initial_bias;
363
364 /* Handle the basic code points: Let b be the number of input code */
365 /* points before the last delimiter, or 0 if there is none, then */
366 /* copy the first b code points to the output. */
367
368 for (b = j = 0; j < input_length; ++j)
369 if (delim (input[j]))
370 b = j;
371 if (b > max_out)
372 return punycode_big_output;
373
374 for (j = 0; j < b; ++j)
375 {
376 if (case_flags)
377 case_flags[out] = flagged (input[j]);
378 if (!basic (input[j]))
379 return punycode_bad_input;
380 output[out++] = input[j];
381 }
382 for (j = b + (b > 0); j < input_length; ++j)
383 if (!basic (input[j]))
384 return punycode_bad_input;
385
386 /* Main decoding loop: Start just after the last delimiter if any */
387 /* basic code points were copied; start at the beginning otherwise. */
388
389 for (in = b > 0 ? b + 1 : 0; in < input_length; ++out)
390 {
391
392 /* in is the index of the next ASCII code point to be consumed, */
393 /* and out is the number of code points in the output array. */
394
395 /* Decode a generalized variable-length integer into delta, */
396 /* which gets added to i. The overflow checking is easier */
397 /* if we increase i as we go, then subtract off its starting */
398 /* value at the end to obtain delta. */
399
400 for (oldi = i, w = 1, k = base;; k += base)
401 {
402 if (in >= input_length)
403 return punycode_bad_input;
404 digit = decode_digit (input[in++]);
405 if (digit >= base)
406 return punycode_bad_input;
407 if (digit > (maxint - i) / w)
408 return punycode_overflow;
409 i += digit * w;
410 t = k <= bias /* + tmin */ ? tmin : /* +tmin not needed */
411 k >= bias + tmax ? tmax : k - bias;
412 if (digit < t)
413 break;
414 if (w > maxint / (base - t))
415 return punycode_overflow;
416 w *= (base - t);
417 }
418
419 bias = adapt (i - oldi, out + 1, oldi == 0);
420
421 /* i was supposed to wrap around from out+1 to 0, */
422 /* incrementing n each time, so we'll fix that now: */
423
424 if (i / (out + 1) > maxint - n)
425 return punycode_overflow;
426 n += i / (out + 1);
427 if (n > 0x10FFFF || (n >= 0xD800 && n <= 0xDBFF))
428 return punycode_bad_input;
429 i %= (out + 1);
430
431 /* Insert n at position i of the output: */
432
433 /* not needed for Punycode: */
434 /* if (basic(n)) return punycode_bad_input; */
435 if (out >= max_out)
436 return punycode_big_output;
437
438 if (case_flags)
439 {
440 memmove (case_flags + i + 1, case_flags + i, out - i);
441 /* Case of last ASCII code point determines case flag: */
442 case_flags[i] = flagged (input[in - 1]);
443 }
444
445 memmove (output + i + 1, output + i, (out - i) * sizeof *output);
446 output[i++] = n;
447 }
448
449 *output_length = (size_t) out;
450 /* cannot overflow because out <= old value of *output_length */
451 return punycode_success;
452}
453
@ base
Definition punycode.c:77
@ tmax
Definition punycode.c:77
@ tmin
Definition punycode.c:77
@ initial_bias
Definition punycode.c:78
@ damp
Definition punycode.c:77
@ skew
Definition punycode.c:77
@ delimiter
Definition punycode.c:78
@ initial_n
Definition punycode.c:78
#define flagged(bcp)
Definition punycode.c:116
#define basic(cp)
Definition punycode.c:82
int punycode_decode(size_t input_length, const char input[], size_t *output_length, punycode_uint output[], unsigned char case_flags[])
Definition punycode.c:348
int punycode_encode(size_t input_length, const punycode_uint input[], const unsigned char case_flags[], size_t *output_length, char output[])
Definition punycode.c:196
#define delim(cp)
Definition punycode.c:85
@ punycode_bad_input
Definition punycode.h:103
@ punycode_success
Definition punycode.h:102
@ punycode_overflow
Definition punycode.h:105
@ punycode_big_output
Definition punycode.h:104
uint32_t punycode_uint
Definition punycode.h:123