/* Copyright (C) 1996-2022 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published
   by the Free Software Foundation; version 2 of the License, or
   (at your option) any later version.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, see <https://www.gnu.org/licenses/>.  */
16
17#ifdef HAVE_CONFIG_H
18# include <config.h>
19#endif
20
21#include <assert.h>
22#include <ctype.h>
23#include <errno.h>
24#include <libintl.h>
25#include <stdarg.h>
26#include <stdlib.h>
27#include <string.h>
28#include <stdint.h>
29
30#include "localedef.h"
31#include "charmap.h"
32#include "error.h"
33#include "linereader.h"
34#include "locfile.h"
35
/* Forward declarations of the file-local token scanners.  Each one is
   called from lr_token after the first character of the token has been
   consumed and returns a pointer to the reader's token.  */

/* Identifier or keyword.  */
static struct token *get_ident (struct linereader *lr);

/* Symbolic name written in angle brackets.  */
static struct token *get_symname (struct linereader *lr);

/* Character code introduced by the escape character.  */
static struct token *get_toplvl_escape (struct linereader *lr);

/* Double-quoted string.  */
static struct token *get_string (struct linereader *lr,
				 const struct charmap_t *charmap,
				 struct localedef_t *locale,
				 const struct repertoire_t *repertoire,
				 int verbose);
45
46
47struct linereader *
48lr_open (const char *fname, kw_hash_fct_t hf)
49{
50 FILE *fp;
51
52 if (fname == NULL || strcmp (s1: fname, s2: "-") == 0
53 || strcmp (s1: fname, s2: "/dev/stdin") == 0)
54 return lr_create (stdin, fname: "<stdin>", hf);
55 else
56 {
57 fp = fopen (filename: fname, modes: "rm");
58 if (fp == NULL)
59 return NULL;
60 return lr_create (fp, fname, hf);
61 }
62}
63
64struct linereader *
65lr_create (FILE *fp, const char *fname, kw_hash_fct_t hf)
66{
67 struct linereader *result;
68 int n;
69
70 result = (struct linereader *) xmalloc (n: sizeof (*result));
71
72 result->fp = fp;
73 result->fname = xstrdup (fname);
74 result->buf = NULL;
75 result->bufsize = 0;
76 result->lineno = 1;
77 result->idx = 0;
78 result->comment_char = '#';
79 result->escape_char = '\\';
80 result->translate_strings = 1;
81 result->return_widestr = 0;
82
83 n = getdelim (lineptr: &result->buf, n: &result->bufsize, delimiter: '\n', stream: result->fp);
84 if (n < 0)
85 {
86 int save = errno;
87 fclose (stream: result->fp);
88 free (ptr: (char *) result->fname);
89 free (ptr: result);
90 errno = save;
91 return NULL;
92 }
93
94 if (n > 1 && result->buf[n - 2] == '\\' && result->buf[n - 1] == '\n')
95 n -= 2;
96
97 result->buf[n] = '\0';
98 result->bufact = n;
99 result->hash_fct = hf;
100
101 return result;
102}
103
104
105int
106lr_eof (struct linereader *lr)
107{
108 return lr->bufact = 0;
109}
110
111
112void
113lr_ignore_rest (struct linereader *lr, int verbose)
114{
115 if (verbose)
116 {
117 while (isspace (lr->buf[lr->idx]) && lr->buf[lr->idx] != '\n'
118 && lr->buf[lr->idx] != lr->comment_char)
119 if (lr->buf[lr->idx] == '\0')
120 {
121 if (lr_next (lr) < 0)
122 return;
123 }
124 else
125 ++lr->idx;
126
127 if (lr->buf[lr->idx] != '\n' && ! feof (stream: lr->fp)
128 && lr->buf[lr->idx] != lr->comment_char)
129 lr_error (lr, _("trailing garbage at end of line"));
130 }
131
132 /* Ignore continued line. */
133 while (lr->bufact > 0 && lr->buf[lr->bufact - 1] != '\n')
134 if (lr_next (lr) < 0)
135 break;
136
137 lr->idx = lr->bufact;
138}
139
140
141void
142lr_close (struct linereader *lr)
143{
144 fclose (stream: lr->fp);
145 free (ptr: lr->buf);
146 free (ptr: lr);
147}
148
149
150int
151lr_next (struct linereader *lr)
152{
153 int n;
154
155 n = getdelim (lineptr: &lr->buf, n: &lr->bufsize, delimiter: '\n', stream: lr->fp);
156 if (n < 0)
157 return -1;
158
159 ++lr->lineno;
160
161 if (n > 1 && lr->buf[n - 2] == lr->escape_char && lr->buf[n - 1] == '\n')
162 {
163#if 0
164 /* XXX Is this correct? */
165 /* An escaped newline character is substituted with a single <SP>. */
166 --n;
167 lr->buf[n - 1] = ' ';
168#else
169 n -= 2;
170#endif
171 }
172
173 lr->buf[n] = '\0';
174 lr->bufact = n;
175 lr->idx = 0;
176
177 return 0;
178}
179
180
/* Both objects below come from outside this file.  */

/* Defined in error.c: counter that goes up by one on every call to
   `error'.  */
extern unsigned int error_message_count;

/* Name of the executing program.  The calling program is expected to
   define this variable and set it.  */
extern char *program_name;
188
189
190struct token *
191lr_token (struct linereader *lr, const struct charmap_t *charmap,
192 struct localedef_t *locale, const struct repertoire_t *repertoire,
193 int verbose)
194{
195 int ch;
196
197 while (1)
198 {
199 do
200 {
201 ch = lr_getc (lr);
202
203 if (ch == EOF)
204 {
205 lr->token.tok = tok_eof;
206 return &lr->token;
207 };
208
209 if (ch == '\n')
210 {
211 lr->token.tok = tok_eol;
212 return &lr->token;
213 }
214 }
215 while (isspace (ch));
216
217 if (ch != lr->comment_char)
218 break;
219
220 /* Is there an newline at the end of the buffer? */
221 if (lr->buf[lr->bufact - 1] != '\n')
222 {
223 /* No. Some people want this to mean that only the line in
224 the file not the logical, concatenated line is ignored.
225 Let's try this. */
226 lr->idx = lr->bufact;
227 continue;
228 }
229
230 /* Ignore rest of line. */
231 lr_ignore_rest (lr, verbose: 0);
232 lr->token.tok = tok_eol;
233 return &lr->token;
234 }
235
236 /* Match escape sequences. */
237 if (ch == lr->escape_char)
238 return get_toplvl_escape (lr);
239
240 /* Match ellipsis. */
241 if (ch == '.')
242 {
243 if (strncmp (s1: &lr->buf[lr->idx], s2: "...(2)....", n: 10) == 0)
244 {
245 int cnt;
246 for (cnt = 0; cnt < 10; ++cnt)
247 lr_getc (lr);
248 lr->token.tok = tok_ellipsis4_2;
249 return &lr->token;
250 }
251 if (strncmp (s1: &lr->buf[lr->idx], s2: "...", n: 3) == 0)
252 {
253 lr_getc (lr);
254 lr_getc (lr);
255 lr_getc (lr);
256 lr->token.tok = tok_ellipsis4;
257 return &lr->token;
258 }
259 if (strncmp (s1: &lr->buf[lr->idx], s2: "..", n: 2) == 0)
260 {
261 lr_getc (lr);
262 lr_getc (lr);
263 lr->token.tok = tok_ellipsis3;
264 return &lr->token;
265 }
266 if (strncmp (s1: &lr->buf[lr->idx], s2: ".(2)..", n: 6) == 0)
267 {
268 int cnt;
269 for (cnt = 0; cnt < 6; ++cnt)
270 lr_getc (lr);
271 lr->token.tok = tok_ellipsis2_2;
272 return &lr->token;
273 }
274 if (lr->buf[lr->idx] == '.')
275 {
276 lr_getc (lr);
277 lr->token.tok = tok_ellipsis2;
278 return &lr->token;
279 }
280 }
281
282 switch (ch)
283 {
284 case '<':
285 return get_symname (lr);
286
287 case '0' ... '9':
288 lr->token.tok = tok_number;
289 lr->token.val.num = ch - '0';
290
291 while (isdigit (ch = lr_getc (lr)))
292 {
293 lr->token.val.num *= 10;
294 lr->token.val.num += ch - '0';
295 }
296 if (isalpha (ch))
297 lr_error (lr, _("garbage at end of number"));
298 lr_ungetn (lr, n: 1);
299
300 return &lr->token;
301
302 case ';':
303 lr->token.tok = tok_semicolon;
304 return &lr->token;
305
306 case ',':
307 lr->token.tok = tok_comma;
308 return &lr->token;
309
310 case '(':
311 lr->token.tok = tok_open_brace;
312 return &lr->token;
313
314 case ')':
315 lr->token.tok = tok_close_brace;
316 return &lr->token;
317
318 case '"':
319 return get_string (lr, charmap, locale, repertoire, verbose);
320
321 case '-':
322 ch = lr_getc (lr);
323 if (ch == '1')
324 {
325 lr->token.tok = tok_minus1;
326 return &lr->token;
327 }
328 lr_ungetn (lr, n: 2);
329 break;
330 }
331
332 return get_ident (lr);
333}
334
335
336static struct token *
337get_toplvl_escape (struct linereader *lr)
338{
339 /* This is supposed to be a numeric value. We return the
340 numerical value and the number of bytes. */
341 size_t start_idx = lr->idx - 1;
342 unsigned char *bytes = lr->token.val.charcode.bytes;
343 size_t nbytes = 0;
344 int ch;
345
346 do
347 {
348 unsigned int byte = 0;
349 unsigned int base = 8;
350
351 ch = lr_getc (lr);
352
353 if (ch == 'd')
354 {
355 base = 10;
356 ch = lr_getc (lr);
357 }
358 else if (ch == 'x')
359 {
360 base = 16;
361 ch = lr_getc (lr);
362 }
363
364 if ((base == 16 && !isxdigit (ch))
365 || (base != 16 && (ch < '0' || ch >= (int) ('0' + base))))
366 {
367 esc_error:
368 lr->token.val.str.startmb = &lr->buf[start_idx];
369
370 while (ch != EOF && !isspace (ch))
371 ch = lr_getc (lr);
372 lr->token.val.str.lenmb = lr->idx - start_idx;
373
374 lr->token.tok = tok_error;
375 return &lr->token;
376 }
377
378 if (isdigit (ch))
379 byte = ch - '0';
380 else
381 byte = tolower (ch) - 'a' + 10;
382
383 ch = lr_getc (lr);
384 if ((base == 16 && !isxdigit (ch))
385 || (base != 16 && (ch < '0' || ch >= (int) ('0' + base))))
386 goto esc_error;
387
388 byte *= base;
389 if (isdigit (ch))
390 byte += ch - '0';
391 else
392 byte += tolower (ch) - 'a' + 10;
393
394 ch = lr_getc (lr);
395 if (base != 16 && isdigit (ch))
396 {
397 byte *= base;
398 byte += ch - '0';
399
400 ch = lr_getc (lr);
401 }
402
403 bytes[nbytes++] = byte;
404 }
405 while (ch == lr->escape_char
406 && nbytes < (int) sizeof (lr->token.val.charcode.bytes));
407
408 if (!isspace (ch))
409 lr_error (lr, _("garbage at end of character code specification"));
410
411 lr_ungetn (lr, n: 1);
412
413 lr->token.tok = tok_charcode;
414 lr->token.val.charcode.nbytes = nbytes;
415
416 return &lr->token;
417}
418
419
/* Append the single character CH to the growing byte buffer.

   The macro works on three variables of the invoking function: `buf'
   (the storage), `bufact' (number of bytes in use) and `bufmax' (its
   capacity).  A full buffer is doubled with xrealloc before the
   character is stored.  */
#define ADDC(ch) \
  do {									      \
    if (bufact == bufmax)						      \
      buf = xrealloc (buf, bufmax *= 2);				      \
    buf[bufact++] = (ch);						      \
  } while (0)
431
432
/* Append the L bytes starting at S to the growing byte buffer.

   The macro works on three variables of the invoking function: `buf'
   (the storage), `bufact' (number of bytes in use) and `bufmax' (its
   capacity).  When the bytes do not fit, the capacity is first raised
   to at least L and then doubled, which always leaves room for the
   existing contents plus the new bytes.  `bufact' is only changed by
   the append itself; the old code overwrote it in the grow path and
   thereby moved the write position.  */
#define ADDS(s, l) \
  do									      \
    {									      \
      size_t _l = (l);							      \
      if (bufact + _l > bufmax)						      \
	{								      \
	  if (bufmax < _l)						      \
	    bufmax = _l;						      \
	  bufmax *= 2;							      \
	  buf = xrealloc (buf, bufmax);					      \
	}								      \
      memcpy (&buf[bufact], s, _l);					      \
      bufact += _l;							      \
    }									      \
  while (0)
448
449
/* Append the wide character CH to the growing wide-character buffer.

   The macro works on three variables of the invoking function: `buf2'
   (the storage), `buf2act' (number of elements in use) and `buf2max'
   (its capacity, counted in ELEMENTS, not bytes).  A full buffer is
   doubled with xrealloc; the byte size is derived from the element
   type of `buf2' instead of a hard-coded 4.  */
#define ADDWC(ch) \
  do									      \
    {									      \
      if (buf2act == buf2max)						      \
	{								      \
	  buf2max *= 2;							      \
	  buf2 = xrealloc (buf2, buf2max * sizeof (*buf2));		      \
	}								      \
      buf2[buf2act++] = (ch);						      \
    }									      \
  while (0)
461
462
463static struct token *
464get_symname (struct linereader *lr)
465{
466 /* Symbol in brackets. We must distinguish three kinds:
467 1. reserved words
468 2. ISO 10646 position values
469 3. all other. */
470 char *buf;
471 size_t bufact = 0;
472 size_t bufmax = 56;
473 const struct keyword_t *kw;
474 int ch;
475
476 buf = (char *) xmalloc (n: bufmax);
477
478 do
479 {
480 ch = lr_getc (lr);
481 if (ch == lr->escape_char)
482 {
483 int c2 = lr_getc (lr);
484 ADDC (c2);
485
486 if (c2 == '\n')
487 ch = '\n';
488 }
489 else
490 ADDC (ch);
491 }
492 while (ch != '>' && ch != '\n');
493
494 if (ch == '\n')
495 lr_error (lr, _("unterminated symbolic name"));
496
497 /* Test for ISO 10646 position value. */
498 if (buf[0] == 'U' && (bufact == 6 || bufact == 10))
499 {
500 char *cp = buf + 1;
501 while (cp < &buf[bufact - 1] && isxdigit (*cp))
502 ++cp;
503
504 if (cp == &buf[bufact - 1])
505 {
506 /* Yes, it is. */
507 lr->token.tok = tok_ucs4;
508 lr->token.val.ucs4 = strtoul (nptr: buf + 1, NULL, base: 16);
509
510 return &lr->token;
511 }
512 }
513
514 /* It is a symbolic name. Test for reserved words. */
515 kw = lr->hash_fct (buf, bufact - 1);
516
517 if (kw != NULL && kw->symname_or_ident == 1)
518 {
519 lr->token.tok = kw->token;
520 free (ptr: buf);
521 }
522 else
523 {
524 lr->token.tok = tok_bsymbol;
525
526 buf = xrealloc (o: buf, n: bufact + 1);
527 buf[bufact] = '\0';
528
529 lr->token.val.str.startmb = buf;
530 lr->token.val.str.lenmb = bufact - 1;
531 }
532
533 return &lr->token;
534}
535
536
537static struct token *
538get_ident (struct linereader *lr)
539{
540 char *buf;
541 size_t bufact;
542 size_t bufmax = 56;
543 const struct keyword_t *kw;
544 int ch;
545
546 buf = xmalloc (n: bufmax);
547 bufact = 0;
548
549 ADDC (lr->buf[lr->idx - 1]);
550
551 while (!isspace ((ch = lr_getc (lr))) && ch != '"' && ch != ';'
552 && ch != '<' && ch != ',' && ch != EOF)
553 {
554 if (ch == lr->escape_char)
555 {
556 ch = lr_getc (lr);
557 if (ch == '\n' || ch == EOF)
558 {
559 lr_error (lr, _("invalid escape sequence"));
560 break;
561 }
562 }
563 ADDC (ch);
564 }
565
566 lr_ungetc (lr, ch);
567
568 kw = lr->hash_fct (buf, bufact);
569
570 if (kw != NULL && kw->symname_or_ident == 0)
571 {
572 lr->token.tok = kw->token;
573 free (ptr: buf);
574 }
575 else
576 {
577 lr->token.tok = tok_ident;
578
579 buf = xrealloc (o: buf, n: bufact + 1);
580 buf[bufact] = '\0';
581
582 lr->token.val.str.startmb = buf;
583 lr->token.val.str.lenmb = bufact;
584 }
585
586 return &lr->token;
587}
588
589
590static struct token *
591get_string (struct linereader *lr, const struct charmap_t *charmap,
592 struct localedef_t *locale, const struct repertoire_t *repertoire,
593 int verbose)
594{
595 int return_widestr = lr->return_widestr;
596 char *buf;
597 wchar_t *buf2 = NULL;
598 size_t bufact;
599 size_t bufmax = 56;
600
601 /* We must return two different strings. */
602 buf = xmalloc (n: bufmax);
603 bufact = 0;
604
605 /* We know it'll be a string. */
606 lr->token.tok = tok_string;
607
608 /* If we need not translate the strings (i.e., expand <...> parts)
609 we can run a simple loop. */
610 if (!lr->translate_strings)
611 {
612 int ch;
613
614 buf2 = NULL;
615 while ((ch = lr_getc (lr)) != '"' && ch != '\n' && ch != EOF)
616 ADDC (ch);
617
618 /* Catch errors with trailing escape character. */
619 if (bufact > 0 && buf[bufact - 1] == lr->escape_char
620 && (bufact == 1 || buf[bufact - 2] != lr->escape_char))
621 {
622 lr_error (lr, _("illegal escape sequence at end of string"));
623 --bufact;
624 }
625 else if (ch == '\n' || ch == EOF)
626 lr_error (lr, _("unterminated string"));
627
628 ADDC ('\0');
629 }
630 else
631 {
632 int illegal_string = 0;
633 size_t buf2act = 0;
634 size_t buf2max = 56 * sizeof (uint32_t);
635 int ch;
636
637 /* We have to provide the wide character result as well. */
638 if (return_widestr)
639 buf2 = xmalloc (n: buf2max);
640
641 /* Read until the end of the string (or end of the line or file). */
642 while ((ch = lr_getc (lr)) != '"' && ch != '\n' && ch != EOF)
643 {
644 size_t startidx;
645 uint32_t wch;
646 struct charseq *seq;
647
648 if (ch != '<')
649 {
650 /* The standards leave it up to the implementation to decide
651 what to do with character which stand for themself. We
652 could jump through hoops to find out the value relative to
653 the charmap and the repertoire map, but instead we leave
654 it up to the locale definition author to write a better
655 definition. We assume here that every character which
656 stands for itself is encoded using ISO 8859-1. Using the
657 escape character is allowed. */
658 if (ch == lr->escape_char)
659 {
660 ch = lr_getc (lr);
661 if (ch == '\n' || ch == EOF)
662 break;
663 }
664
665 ADDC (ch);
666 if (return_widestr)
667 ADDWC ((uint32_t) ch);
668
669 continue;
670 }
671
672 /* Now we have to search for the end of the symbolic name, i.e.,
673 the closing '>'. */
674 startidx = bufact;
675 while ((ch = lr_getc (lr)) != '>' && ch != '\n' && ch != EOF)
676 {
677 if (ch == lr->escape_char)
678 {
679 ch = lr_getc (lr);
680 if (ch == '\n' || ch == EOF)
681 break;
682 }
683 ADDC (ch);
684 }
685 if (ch == '\n' || ch == EOF)
686 /* Not a correct string. */
687 break;
688 if (bufact == startidx)
689 {
690 /* <> is no correct name. Ignore it and also signal an
691 error. */
692 illegal_string = 1;
693 continue;
694 }
695
696 /* It might be a Uxxxx symbol. */
697 if (buf[startidx] == 'U'
698 && (bufact - startidx == 5 || bufact - startidx == 9))
699 {
700 char *cp = buf + startidx + 1;
701 while (cp < &buf[bufact] && isxdigit (*cp))
702 ++cp;
703
704 if (cp == &buf[bufact])
705 {
706 char utmp[10];
707
708 /* Yes, it is. */
709 ADDC ('\0');
710 wch = strtoul (nptr: buf + startidx + 1, NULL, base: 16);
711
712 /* Now forget about the name we just added. */
713 bufact = startidx;
714
715 if (return_widestr)
716 ADDWC (wch);
717
718 /* See whether the charmap contains the Uxxxxxxxx names. */
719 snprintf (s: utmp, maxlen: sizeof (utmp), format: "U%08X", wch);
720 seq = charmap_find_value (charmap, name: utmp, len: 9);
721
722 if (seq == NULL)
723 {
724 /* No, this isn't the case. Now determine from
725 the repertoire the name of the character and
726 find it in the charmap. */
727 if (repertoire != NULL)
728 {
729 const char *symbol;
730
731 symbol = repertoire_find_symbol (repertoire, ucs: wch);
732
733 if (symbol != NULL)
734 seq = charmap_find_value (charmap, name: symbol,
735 len: strlen (s: symbol));
736 }
737
738 if (seq == NULL)
739 {
740#ifndef NO_TRANSLITERATION
741 /* Transliterate if possible. */
742 if (locale != NULL)
743 {
744 uint32_t *translit;
745
746 if ((locale->avail & CTYPE_LOCALE) == 0)
747 {
748 /* Load the CTYPE data now. */
749 int old_needed = locale->needed;
750
751 locale->needed = 0;
752 locale = load_locale (LC_CTYPE,
753 locale->name,
754 locale->repertoire_name,
755 charmap, locale);
756 locale->needed = old_needed;
757 }
758
759 if ((locale->avail & CTYPE_LOCALE) != 0
760 && ((translit = find_translit (locale,
761 charmap, wch))
762 != NULL))
763 /* The CTYPE data contains a matching
764 transliteration. */
765 {
766 int i;
767
768 for (i = 0; translit[i] != 0; ++i)
769 {
770 char utmp[10];
771
772 snprintf (utmp, sizeof (utmp), "U%08X",
773 translit[i]);
774 seq = charmap_find_value (charmap, utmp,
775 9);
776 assert (seq != NULL);
777 ADDS (seq->bytes, seq->nbytes);
778 }
779
780 continue;
781 }
782 }
783#endif /* NO_TRANSLITERATION */
784
785 /* Not a known name. */
786 illegal_string = 1;
787 }
788 }
789
790 if (seq != NULL)
791 ADDS (seq->bytes, seq->nbytes);
792
793 continue;
794 }
795 }
796
797 /* We now have the symbolic name in buf[startidx] to
798 buf[bufact-1]. Now find out the value for this character
799 in the charmap as well as in the repertoire map (in this
800 order). */
801 seq = charmap_find_value (charmap, name: &buf[startidx],
802 len: bufact - startidx);
803
804 if (seq == NULL)
805 {
806 /* This name is not in the charmap. */
807 lr_error (lr, _("symbol `%.*s' not in charmap"),
808 (int) (bufact - startidx), &buf[startidx]);
809 illegal_string = 1;
810 }
811
812 if (return_widestr)
813 {
814 /* Now the same for the multibyte representation. */
815 if (seq != NULL && seq->ucs4 != UNINITIALIZED_CHAR_VALUE)
816 wch = seq->ucs4;
817 else
818 {
819 wch = repertoire_find_value (repertoire, name: &buf[startidx],
820 len: bufact - startidx);
821 if (seq != NULL)
822 seq->ucs4 = wch;
823 }
824
825 if (wch == ILLEGAL_CHAR_VALUE)
826 {
827 /* This name is not in the repertoire map. */
828 lr_error (lr, _("symbol `%.*s' not in repertoire map"),
829 (int) (bufact - startidx), &buf[startidx]);
830 illegal_string = 1;
831 }
832 else
833 ADDWC (wch);
834 }
835
836 /* Now forget about the name we just added. */
837 bufact = startidx;
838
839 /* And copy the bytes. */
840 if (seq != NULL)
841 ADDS (seq->bytes, seq->nbytes);
842 }
843
844 if (ch == '\n' || ch == EOF)
845 {
846 lr_error (lr, _("unterminated string"));
847 illegal_string = 1;
848 }
849
850 if (illegal_string)
851 {
852 free (ptr: buf);
853 free (ptr: buf2);
854 lr->token.val.str.startmb = NULL;
855 lr->token.val.str.lenmb = 0;
856 lr->token.val.str.startwc = NULL;
857 lr->token.val.str.lenwc = 0;
858
859 return &lr->token;
860 }
861
862 ADDC ('\0');
863
864 if (return_widestr)
865 {
866 ADDWC (0);
867 lr->token.val.str.startwc = xrealloc (o: buf2,
868 n: buf2act * sizeof (uint32_t));
869 lr->token.val.str.lenwc = buf2act;
870 }
871 }
872
873 lr->token.val.str.startmb = xrealloc (o: buf, n: bufact);
874 lr->token.val.str.lenmb = bufact;
875
876 return &lr->token;
877}
878

/* Source: glibc/locale/programs/linereader.c  */