libidn 1.42
idna.c
Go to the documentation of this file.
1/* idna.c --- Implementation of the Internationalized Domain Name library.
2 Copyright (C) 2002-2024 Simon Josefsson
3
4 This file is part of GNU Libidn.
5
6 GNU Libidn is free software: you can redistribute it and/or
7 modify it under the terms of either:
8
9 * the GNU Lesser General Public License as published by the Free
10 Software Foundation; either version 3 of the License, or (at
11 your option) any later version.
12
13 or
14
15 * the GNU General Public License as published by the Free
16 Software Foundation; either version 2 of the License, or (at
17 your option) any later version.
18
19 or both in parallel, as here.
20
21 GNU Libidn is distributed in the hope that it will be useful,
22 but WITHOUT ANY WARRANTY; without even the implied warranty of
23 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
24 General Public License for more details.
25
26 You should have received copies of the GNU General Public License and
27 the GNU Lesser General Public License along with this program. If
28 not, see <https://www.gnu.org/licenses/>. */
29
30#ifdef HAVE_CONFIG_H
31# include "config.h"
32#endif
33
34#include <stdlib.h>
35#include <string.h>
36#include <stringprep.h>
37#include <punycode.h>
38
39#include "idna.h"
40
41/* Get c_strcasecmp. */
42#include <c-strcase.h>
43
/* DOTP(c) is non-zero when the code point C is one of the four
   characters recognized as a label separator: U+002E (full stop),
   U+3002 (ideographic full stop), U+FF0E (fullwidth full stop) or
   U+FF61 (halfwidth ideographic full stop).  The argument is
   evaluated up to four times, so it must not have side effects. */
#define DOTP(c)                 \
  ((c) == 0x002E                \
   || (c) == 0x3002             \
   || (c) == 0xFF0E             \
   || (c) == 0xFF61)
46
47/* Core functions */
48
80int
81idna_to_ascii_4i (const uint32_t *in, size_t inlen, char *out, int flags)
82{
83 size_t len, outlen;
84 uint32_t *src; /* XXX don't need to copy data? */
85 int rc;
86
87 /*
88 * ToASCII consists of the following steps:
89 *
90 * 1. If all code points in the sequence are in the ASCII range (0..7F)
91 * then skip to step 3.
92 */
93
94 {
95 size_t i;
96 int inasciirange;
97
98 inasciirange = 1;
99 for (i = 0; i < inlen; i++)
100 if (in[i] > 0x7F)
101 inasciirange = 0;
102 if (inasciirange)
103 {
104 src = malloc (sizeof (in[0]) * (inlen + 1));
105 if (src == NULL)
106 return IDNA_MALLOC_ERROR;
107
108 memcpy (src, in, sizeof (in[0]) * inlen);
109 src[inlen] = 0;
110
111 goto step3;
112 }
113 }
114
115 /*
116 * 2. Perform the steps specified in [NAMEPREP] and fail if there is
117 * an error. The AllowUnassigned flag is used in [NAMEPREP].
118 */
119
120 {
121 char *p;
122
123 p = stringprep_ucs4_to_utf8 (in, (ssize_t) inlen, NULL, NULL);
124 if (p == NULL)
125 return IDNA_MALLOC_ERROR;
126
127 len = strlen (p);
128 do
129 {
130 char *newp;
131
132 len = 2 * len + 10; /* XXX better guess? */
133 newp = realloc (p, len);
134 if (newp == NULL)
135 {
136 free (p);
137 return IDNA_MALLOC_ERROR;
138 }
139 p = newp;
140
141 if (flags & IDNA_ALLOW_UNASSIGNED)
142 rc = stringprep_nameprep (p, len);
143 else
145 }
146 while (rc == STRINGPREP_TOO_SMALL_BUFFER);
147
148 if (rc != STRINGPREP_OK)
149 {
150 free (p);
152 }
153
154 src = stringprep_utf8_to_ucs4 (p, -1, NULL);
155
156 free (p);
157
158 if (!src)
159 return IDNA_MALLOC_ERROR;
160 }
161
162step3:
163 /*
164 * 3. If the UseSTD3ASCIIRules flag is set, then perform these checks:
165 *
166 * (a) Verify the absence of non-LDH ASCII code points; that is,
167 * the absence of 0..2C, 2E..2F, 3A..40, 5B..60, and 7B..7F.
168 *
169 * (b) Verify the absence of leading and trailing hyphen-minus;
170 * that is, the absence of U+002D at the beginning and end of
171 * the sequence.
172 */
173
174 if (flags & IDNA_USE_STD3_ASCII_RULES)
175 {
176 size_t i;
177
178 for (i = 0; src[i]; i++)
179 if (src[i] <= 0x2C || src[i] == 0x2E || src[i] == 0x2F ||
180 (src[i] >= 0x3A && src[i] <= 0x40) ||
181 (src[i] >= 0x5B && src[i] <= 0x60) ||
182 (src[i] >= 0x7B && src[i] <= 0x7F))
183 {
184 free (src);
186 }
187
188 if (src[0] == 0x002D || (i > 0 && src[i - 1] == 0x002D))
189 {
190 free (src);
191 return IDNA_CONTAINS_MINUS;
192 }
193 }
194
195 /*
196 * 4. If all code points in the sequence are in the ASCII range
197 * (0..7F), then skip to step 8.
198 */
199
200 {
201 size_t i;
202 int inasciirange;
203
204 inasciirange = 1;
205 for (i = 0; src[i]; i++)
206 {
207 if (src[i] > 0x7F)
208 inasciirange = 0;
209 /* copy string to output buffer if we are about to skip to step8 */
210 if (i < 64)
211 out[i] = src[i];
212 }
213 if (i < 64)
214 out[i] = '\0';
215 else
216 {
217 free (src);
218 return IDNA_INVALID_LENGTH;
219 }
220 if (inasciirange)
221 goto step8;
222 }
223
224 /*
225 * 5. Verify that the sequence does NOT begin with the ACE prefix.
226 *
227 */
228
229 {
230 size_t i;
231 int match;
232
233 match = 1;
234 for (i = 0; match && i < strlen (IDNA_ACE_PREFIX); i++)
235 if (((uint32_t) IDNA_ACE_PREFIX[i] & 0xFF) != src[i])
236 match = 0;
237 if (match)
238 {
239 free (src);
241 }
242 }
243
244 /*
245 * 6. Encode the sequence using the encoding algorithm in [PUNYCODE]
246 * and fail if there is an error.
247 */
248 for (len = 0; src[len]; len++)
249 ;
250 src[len] = '\0';
251 outlen = 63 - strlen (IDNA_ACE_PREFIX);
252 rc = punycode_encode (len, src, NULL,
253 &outlen, &out[strlen (IDNA_ACE_PREFIX)]);
254 if (rc != PUNYCODE_SUCCESS)
255 {
256 free (src);
257 return IDNA_PUNYCODE_ERROR;
258 }
259 out[strlen (IDNA_ACE_PREFIX) + outlen] = '\0';
260
261 /*
262 * 7. Prepend the ACE prefix.
263 */
264
265 memcpy (out, IDNA_ACE_PREFIX, strlen (IDNA_ACE_PREFIX));
266
267 /*
268 * 8. Verify that the number of code points is in the range 1 to 63
269 * inclusive (0 is excluded).
270 */
271
272step8:
273 free (src);
274 if (strlen (out) < 1)
275 return IDNA_INVALID_LENGTH;
276
277 return IDNA_SUCCESS;
278}
279
280/* ToUnicode(). May realloc() utf8in. Will free utf8in unconditionally. */
281static int
282idna_to_unicode_internal (char *utf8in,
283 uint32_t *out, size_t *outlen, int flags)
284{
285 int rc;
286 char tmpout[64];
287 size_t utf8len = strlen (utf8in) + 1;
288 size_t addlen = 0, addinc = utf8len / 10 + 1;
289
290 /*
291 * ToUnicode consists of the following steps:
292 *
293 * 1. If the sequence contains any code points outside the ASCII range
294 * (0..7F) then proceed to step 2, otherwise skip to step 3.
295 */
296
297 {
298 size_t i;
299 int inasciirange;
300
301 inasciirange = 1;
302 for (i = 0; utf8in[i]; i++)
303 if (utf8in[i] & ~0x7F)
304 inasciirange = 0;
305 if (inasciirange)
306 goto step3;
307 }
308
309 /*
310 * 2. Perform the steps specified in [NAMEPREP] and fail if there is an
311 * error. (If step 3 of ToASCII is also performed here, it will not
312 * affect the overall behavior of ToUnicode, but it is not
313 * necessary.) The AllowUnassigned flag is used in [NAMEPREP].
314 */
315 do
316 {
317 char *newp = realloc (utf8in, utf8len + addlen);
318 if (newp == NULL)
319 {
320 free (utf8in);
321 return IDNA_MALLOC_ERROR;
322 }
323 utf8in = newp;
324 if (flags & IDNA_ALLOW_UNASSIGNED)
325 rc = stringprep_nameprep (utf8in, utf8len + addlen);
326 else
327 rc = stringprep_nameprep_no_unassigned (utf8in, utf8len + addlen);
328 addlen += addinc;
329 addinc *= 2;
330 }
331 while (rc == STRINGPREP_TOO_SMALL_BUFFER);
332
333 if (rc != STRINGPREP_OK)
334 {
335 free (utf8in);
337 }
338
339 /* 3. Verify that the sequence begins with the ACE prefix, and save a
340 * copy of the sequence.
341 * ... The ToASCII and ToUnicode operations MUST recognize the ACE
342 prefix in a case-insensitive manner.
343 */
344
345step3:
346 if (c_strncasecmp (utf8in, IDNA_ACE_PREFIX, strlen (IDNA_ACE_PREFIX)) != 0)
347 {
348 free (utf8in);
349 return IDNA_NO_ACE_PREFIX;
350 }
351
352 /* 4. Remove the ACE prefix.
353 */
354
355 memmove (utf8in, &utf8in[strlen (IDNA_ACE_PREFIX)],
356 strlen (utf8in) - strlen (IDNA_ACE_PREFIX) + 1);
357
358 /* 5. Decode the sequence using the decoding algorithm in [PUNYCODE]
359 * and fail if there is an error. Save a copy of the result of
360 * this step.
361 */
362
363 (*outlen)--; /* reserve one for the zero */
364
365 rc = punycode_decode (strlen (utf8in), utf8in, outlen, out, NULL);
366 if (rc != PUNYCODE_SUCCESS)
367 {
368 free (utf8in);
369 return IDNA_PUNYCODE_ERROR;
370 }
371
372 out[*outlen] = 0; /* add zero */
373
374 /* 6. Apply ToASCII.
375 */
376
377 rc = idna_to_ascii_4i (out, *outlen, tmpout, flags);
378 if (rc != IDNA_SUCCESS)
379 {
380 free (utf8in);
381 return rc;
382 }
383
384 /* 7. Verify that the result of step 6 matches the saved copy from
385 * step 3, using a case-insensitive ASCII comparison.
386 */
387
388 if (c_strcasecmp (utf8in, tmpout + strlen (IDNA_ACE_PREFIX)) != 0)
389 {
390 free (utf8in);
392 }
393
394 /* 8. Return the saved copy from step 5.
395 */
396
397 free (utf8in);
398 return IDNA_SUCCESS;
399}
400
436int
437idna_to_unicode_44i (const uint32_t *in, size_t inlen,
438 uint32_t *out, size_t *outlen, int flags)
439{
440 int rc;
441 size_t outlensave = *outlen;
442 char *p;
443
444 p = stringprep_ucs4_to_utf8 (in, (ssize_t) inlen, NULL, NULL);
445 if (p == NULL)
446 return IDNA_MALLOC_ERROR;
447
448 rc = idna_to_unicode_internal (p, out, outlen, flags);
449 if (rc != IDNA_SUCCESS)
450 {
451 memcpy (out, in, sizeof (in[0]) * (inlen < outlensave ?
452 inlen : outlensave));
453 *outlen = inlen;
454 }
455
456 /* p is freed in idna_to_unicode_internal. */
457
458 return rc;
459}
460
461/* Wrappers that handle several labels */
462
476int
477idna_to_ascii_4z (const uint32_t *input, char **output, int flags)
478{
479 const uint32_t *start = input;
480 const uint32_t *end;
481 char buf[64];
482 char *out = NULL;
483 int rc;
484
485 /* 1) Whenever dots are used as label separators, the following
486 characters MUST be recognized as dots: U+002E (full stop),
487 U+3002 (ideographic full stop), U+FF0E (fullwidth full stop),
488 U+FF61 (halfwidth ideographic full stop). */
489
490 if (input[0] == 0)
491 {
492 /* Handle implicit zero-length root label. */
493 *output = malloc (1);
494 if (!*output)
495 return IDNA_MALLOC_ERROR;
496 strcpy (*output, "");
497 return IDNA_SUCCESS;
498 }
499
500 if (DOTP (input[0]) && input[1] == 0)
501 {
502 /* Handle explicit zero-length root label. */
503 *output = malloc (2);
504 if (!*output)
505 return IDNA_MALLOC_ERROR;
506 strcpy (*output, ".");
507 return IDNA_SUCCESS;
508 }
509
510 *output = NULL;
511 do
512 {
513 end = start;
514
515 for (; *end && !DOTP (*end); end++)
516 ;
517
518 if (*end == '\0' && start == end)
519 {
520 /* Handle explicit zero-length root label. */
521 buf[0] = '\0';
522 }
523 else
524 {
525 rc = idna_to_ascii_4i (start, (size_t) (end - start), buf, flags);
526 if (rc != IDNA_SUCCESS)
527 {
528 free (out);
529 return rc;
530 }
531 }
532
533 if (out)
534 {
535 size_t l = strlen (out) + 1 + strlen (buf) + 1;
536 char *newp = realloc (out, l);
537 if (!newp)
538 {
539 free (out);
540 return IDNA_MALLOC_ERROR;
541 }
542 out = newp;
543 strcat (out, ".");
544 strcat (out, buf);
545 }
546 else
547 {
548 out = strdup (buf);
549 if (!out)
550 return IDNA_MALLOC_ERROR;
551 }
552
553 start = end + 1;
554 }
555 while (*end);
556
557 *output = out;
558
559 return IDNA_SUCCESS;
560}
561
575int
576idna_to_ascii_8z (const char *input, char **output, int flags)
577{
578 uint32_t *ucs4;
579 size_t ucs4len;
580 int rc;
581
582 ucs4 = stringprep_utf8_to_ucs4 (input, -1, &ucs4len);
583 if (!ucs4)
584 return IDNA_ICONV_ERROR;
585
586 rc = idna_to_ascii_4z (ucs4, output, flags);
587
588 free (ucs4);
589
590 return rc;
591
592}
593
608int
609idna_to_ascii_lz (const char *input, char **output, int flags)
610{
611 char *utf8;
612 int rc;
613
614 utf8 = stringprep_locale_to_utf8 (input);
615 if (!utf8)
616 return IDNA_ICONV_ERROR;
617
618 rc = idna_to_ascii_8z (utf8, output, flags);
619
620 free (utf8);
621
622 return rc;
623}
624
639int
640idna_to_unicode_4z4z (const uint32_t *input, uint32_t **output, int flags)
641{
642 const uint32_t *start = input;
643 const uint32_t *end;
644 uint32_t *buf;
645 size_t buflen;
646 uint32_t *out = NULL;
647 size_t outlen = 0;
648
649 *output = NULL;
650
651 do
652 {
653 end = start;
654
655 for (; *end && !DOTP (*end); end++)
656 ;
657
658 buflen = (size_t) (end - start);
659 buf = malloc (sizeof (buf[0]) * (buflen + 1));
660 if (!buf)
661 {
662 free (out);
663 return IDNA_MALLOC_ERROR;
664 }
665
666 /* don't check return code as per specification! */
667 idna_to_unicode_44i (start, (size_t) (end - start),
668 buf, &buflen, flags);
669
670 if (out)
671 {
672 uint32_t *newp = realloc (out,
673 sizeof (out[0])
674 * (outlen + 1 + buflen + 1));
675 if (!newp)
676 {
677 free (buf);
678 free (out);
679 return IDNA_MALLOC_ERROR;
680 }
681 out = newp;
682 out[outlen++] = 0x002E; /* '.' (full stop) */
683 memcpy (out + outlen, buf, sizeof (buf[0]) * buflen);
684 outlen += buflen;
685 out[outlen] = 0x0;
686 free (buf);
687 }
688 else
689 {
690 out = buf;
691 outlen = buflen;
692 out[outlen] = 0x0;
693 }
694
695 start = end + 1;
696 }
697 while (*end);
698
699 *output = out;
700
701 return IDNA_SUCCESS;
702}
703
718int
719idna_to_unicode_8z4z (const char *input, uint32_t **output, int flags)
720{
721 uint32_t *ucs4;
722 size_t ucs4len;
723 int rc;
724
725 ucs4 = stringprep_utf8_to_ucs4 (input, -1, &ucs4len);
726 if (!ucs4)
727 return IDNA_ICONV_ERROR;
728
729 rc = idna_to_unicode_4z4z (ucs4, output, flags);
730 free (ucs4);
731
732 return rc;
733}
734
749int
750idna_to_unicode_8z8z (const char *input, char **output, int flags)
751{
752 uint32_t *ucs4;
753 int rc;
754
755 rc = idna_to_unicode_8z4z (input, &ucs4, flags);
756 if (rc != IDNA_SUCCESS)
757 return rc;
758
759 *output = stringprep_ucs4_to_utf8 (ucs4, -1, NULL, NULL);
760 free (ucs4);
761
762 if (!*output)
763 return IDNA_ICONV_ERROR;
764
765 return IDNA_SUCCESS;
766}
767
783int
784idna_to_unicode_8zlz (const char *input, char **output, int flags)
785{
786 char *utf8;
787 int rc;
788
789 rc = idna_to_unicode_8z8z (input, &utf8, flags);
790 if (rc != IDNA_SUCCESS)
791 return rc;
792
793 *output = stringprep_utf8_to_locale (utf8);
794 free (utf8);
795
796 if (!*output)
797 return IDNA_ICONV_ERROR;
798
799 return IDNA_SUCCESS;
800}
801
818int
819idna_to_unicode_lzlz (const char *input, char **output, int flags)
820{
821 char *utf8;
822 int rc;
823
824 utf8 = stringprep_locale_to_utf8 (input);
825 if (!utf8)
826 return IDNA_ICONV_ERROR;
827
828 rc = idna_to_unicode_8zlz (utf8, output, flags);
829 free (utf8);
830
831 return rc;
832}
833
int idna_to_unicode_8zlz(const char *input, char **output, int flags)
Definition idna.c:784
#define DOTP(c)
Definition idna.c:44
int idna_to_unicode_4z4z(const uint32_t *input, uint32_t **output, int flags)
Definition idna.c:640
int idna_to_ascii_8z(const char *input, char **output, int flags)
Definition idna.c:576
int idna_to_ascii_4z(const uint32_t *input, char **output, int flags)
Definition idna.c:477
int idna_to_unicode_lzlz(const char *input, char **output, int flags)
Definition idna.c:819
int idna_to_unicode_8z4z(const char *input, uint32_t **output, int flags)
Definition idna.c:719
int idna_to_unicode_44i(const uint32_t *in, size_t inlen, uint32_t *out, size_t *outlen, int flags)
Definition idna.c:437
int idna_to_unicode_8z8z(const char *input, char **output, int flags)
Definition idna.c:750
int idna_to_ascii_4i(const uint32_t *in, size_t inlen, char *out, int flags)
Definition idna.c:81
int idna_to_ascii_lz(const char *input, char **output, int flags)
Definition idna.c:609
@ IDNA_ROUNDTRIP_VERIFY_ERROR
Definition idna.h:83
@ IDNA_PUNYCODE_ERROR
Definition idna.h:76
@ IDNA_SUCCESS
Definition idna.h:74
@ IDNA_NO_ACE_PREFIX
Definition idna.h:82
@ IDNA_CONTAINS_MINUS
Definition idna.h:80
@ IDNA_ICONV_ERROR
Definition idna.h:85
@ IDNA_STRINGPREP_ERROR
Definition idna.h:75
@ IDNA_CONTAINS_ACE_PREFIX
Definition idna.h:84
@ IDNA_CONTAINS_NON_LDH
Definition idna.h:77
@ IDNA_INVALID_LENGTH
Definition idna.h:81
@ IDNA_MALLOC_ERROR
Definition idna.h:87
@ IDNA_USE_STD3_ASCII_RULES
Definition idna.h:95
@ IDNA_ALLOW_UNASSIGNED
Definition idna.h:94
#define IDNA_ACE_PREFIX
Definition idna.h:99
char * stringprep_ucs4_to_utf8(const uint32_t *str, ssize_t len, size_t *items_read, size_t *items_written)
Definition nfkc.c:1039
uint32_t * stringprep_utf8_to_ucs4(const char *str, ssize_t len, size_t *items_written)
Definition nfkc.c:1006
int punycode_decode(size_t input_length, const char input[], size_t *output_length, punycode_uint output[], unsigned char case_flags[])
Definition punycode.c:348
int punycode_encode(size_t input_length, const punycode_uint input[], const unsigned char case_flags[], size_t *output_length, char output[])
Definition punycode.c:196
@ PUNYCODE_SUCCESS
Definition punycode.h:110
@ STRINGPREP_TOO_SMALL_BUFFER
Definition stringprep.h:75
@ STRINGPREP_OK
Definition stringprep.h:67
IDNAPI char * stringprep_utf8_to_locale(const char *str)
Definition toutf8.c:161
#define stringprep_nameprep(in, maxlen)
Definition stringprep.h:202
IDNAPI char * stringprep_locale_to_utf8(const char *str)
Definition toutf8.c:145
#define stringprep_nameprep_no_unassigned(in, maxlen)
Definition stringprep.h:205