Project_CodeNet/tools/tokenizer/libtoken.c at main · IBM/Project_CodeNet · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
/* Copyright (c) 2021, 2022 International Business Machines Corporation
   Prepared by: Geert Janssen <geert@us.ibm.com>

   Code functionality shared by all tokenizers.
   This obviously avoids code duplication and associated maintenance problems.
*/

#include "libtoken.h"

// Program globals (state shared by the character reader and the tokenizers):
const char *filename = "stdin";  // current file being parsed
unsigned linenr = 1;       // physical line number counted from 1
unsigned column = 0;       // byte position in physical line, from 0
unsigned char_count = 0;   // total byte count
unsigned utf8_count = 0;   // total utf-8 encoded unicode codepoints

// Lookahead store: unget() appends at the end and get() takes from the end,
// so the most recently pushed-back character is delivered first.
int buffer[MAX_BUF];       // use buffer as multi-char lookahead.
unsigned buffered = 0;     // number of buffered bytes
unsigned saved_col = 0;    // one-place buf for last column on prev line

// Program option settings:
int debug = 0;             // when 1 debug output to stderr
int verbose = 0;           // when 1 info output to stderr
int nowarn = 0;            // when 1 warnings are suppressed

unsigned illegals = 0;     // count number of illegal characters
unsigned unexpect_eof = 0; // encountered unexpected EOF
int hash_as_comment = 0;   // when 1 treat # as line comment
int newline_token = 0;     // when 1 output newline pseudo-token
int comment_token = 0;     // when 1 output comments as tokens
int whitespace_token = 0;  // when 1 output adjacent white-space as a token
int continuation_token = 0;  // when 1 output line continuation pseudo-token

// File-private mode switch for get(): when 1, get() silently skips escaped
// newlines; when 0, get() reports each one by returning '\r'.
static int logical_lines = 0;     // when 1 ignore line continuations in get()

// Printable names of the token classes.
// Must be synced with enum TokenClass! The numeric comments give the index
// of each name; presumably that is also the value of the matching enumerator
// (the enum itself is declared outside this file -- verify there).
const char *token_class[] = {
  /* 0*/ "identifier",
  /* 1*/ "keyword",
  /* 2*/ "string",
  /* 3*/ "character",
  /* 4*/ "integer",
  /* 5*/ "floating",
  /* 6*/ "operator",
  /* 7*/ "preprocessor",
  /* 8*/ "line_comment",
  /* 9*/ "block_comment",
  /*10*/ "whitespace",
  /*11*/ "newline",
  /*12*/ "continuation",
  /*13*/ "filename",
  /*14*/ "endoffile"
};

/* No longer using perfect hash function but simple binary search.
   Consequence: every *_keywords table below MUST stay sorted in strcmp()
   order (plain byte order: '_' and uppercase letters sort before lowercase),
   otherwise the lookup silently misses entries.
*/

/* C11 n1570.pdf 6.4.1 (44)
   C17 n2176.pdf 6.4.1 (A.1.2) (44)
*/
static const char *C_keywords[] = {
  "_Alignas",   "_Alignof",     "_Atomic",      "_Bool",        "_Complex",
  "_Generic",   "_Imaginary",   "_Noreturn",    "_Static_assert",
  "_Thread_local",

  "auto",       "break",        "case",         "char",         "const",
  "continue",   "default",      "do",           "double",       "else",
  "enum",       "extern",       "float",        "for",          "goto",
  "if",         "inline",       "int",          "long",         "register",
  "restrict",   "return",       "short",        "signed",       "sizeof",
  "static",     "struct",       "switch",       "typedef",      "union",
  "unsigned",   "void",         "volatile",     "while"
};

#if 0
/* Disabled (never compiled): the older C++14 keyword set, kept for reference
   only. The active CPP_keywords table is the C++23 one defined right after
   this block.
*/
/* C++ 2014 n4296.pdf 2.11 (84) */
static const char *CPP_keywords[] = {
  "alignas",       "alignof",       "and",           "and_eq",     "asm",
  "auto",          "bitand",        "bitor",         "bool",       "break",
  "case",          "catch",         "char",          "char16_t",   "char32_t",
  "class",         "compl",         "const",         "const_cast", "constexpr",
  "continue",      "decltype",      "default",       "delete",     "do",
  "double",        "dynamic_cast",  "else",          "enum",       "explicit",
  "export",        "extern",        "false",         "float",      "for",
  "friend",        "goto",          "if",            "inline",     "int",
  "long",          "mutable",       "namespace",     "new",        "noexcept",
  "not",           "not_eq",        "nullptr",       "operator",   "or",
  "or_eq",         "private",       "protected",     "public",     "register",
  "reinterpret_cast", "return",     "short",         "signed",     "sizeof",
  "static",        "static_assert", "static_cast",   "struct",     "switch",
  "template",      "this",          "thread_local",  "throw",      "true",
  "try",           "typedef",       "typeid",        "typename",   "union",
  "unsigned",      "using",         "virtual",       "void",       "volatile",
  "wchar_t",       "while",         "xor",           "xor_eq"
};
#endif

/* C++23 n4885.pdf 5.11 (92) */
/* Searched by binary search: keep sorted in strcmp() order. Note that '_'
   sorts before lowercase letters (e.g. "co_yield" precedes "compl" and
   "const_cast" precedes "consteval"), and digits sort as characters
   ("char16_t" < "char32_t" < "char8_t").
*/
static const char *CPP_keywords[] = {
  "alignas",       "alignof",       "and",           "and_eq",     "asm",
  "auto",          "bitand",        "bitor",         "bool",       "break",
  "case",          "catch",         "char",          "char16_t",   "char32_t",
  "char8_t",       "class",         "co_await",      "co_return",  "co_yield",
  "compl",         "concept",       "const",         "const_cast", "consteval",
  "constexpr",     "constinit",     "continue",      "decltype",   "default",
  "delete",        "do",            "double",        "dynamic_cast", "else",
  "enum",          "explicit",      "export",        "extern",     "false",
  "float",         "for",           "friend",        "goto",       "if",
  "inline",        "int",           "long",          "mutable",    "namespace",
  "new",           "noexcept",      "not",           "not_eq",     "nullptr",
  "operator",      "or",            "or_eq",         "private",    "protected",
  "public",        "register",      "reinterpret_cast", "requires","return",
  "short",         "signed",        "sizeof",        "static",  "static_assert",
  "static_cast",   "struct",        "switch",        "template",   "this",
  "thread_local",  "throw",         "true",          "try",        "typedef",
  "typeid",        "typename",      "union",         "unsigned",   "using",
  "virtual",       "void",          "volatile",      "wchar_t",    "while",
  "xor",           "xor_eq"
};

/* Java SE 8 (50) (false, true, null are literals) */
/* https://docs.oracle.com/javase/specs/jls/se8/html/jls-3.html#jls-3.9 */
/* Searched by binary search: keep sorted in strcmp() order. */
static const char *Java_keywords[] = {
  "abstract", "assert",     "boolean", "break",     "byte",      "case",
  "catch",    "char",       "class",   "const",     "continue",  "default",
  "do",       "double",     "else",    "enum",      "extends",   "final",
  "finally",  "float",      "for",     "goto",      "if",        "implements",
  "import",   "instanceof", "int",     "interface", "long",      "native",
  "new",      "package",    "private", "protected", "public",    "return",
  "short",    "static",     "strictfp","super",     "switch", "synchronized",
  "this",     "throw",      "throws",  "transient", "try",       "void",
  "volatile", "while"
};

/* Python reserved words (35 entries; includes async and await).
   Searched by binary search: keep sorted in strcmp() order, which places the
   capitalized False, None, True before all lowercase words.
*/
static const char *Python_keywords[] = {
  "False",  "None",   "True",    "and",      "as",       "assert", "async",
  "await",  "break",  "class",   "continue", "def",      "del",    "elif",
  "else",   "except", "finally", "for",      "from",     "global", "if",
  "import", "in",     "is",      "lambda",   "nonlocal", "not",    "or",
  "pass",   "raise",  "return",  "try",      "while",    "with",   "yield"
};

/* JavaScript reserved words.
   Includes future reserved keywords, strict mode reserved words and module
   code reserved words, as well as all the older standards future reserved
   words, and the literals null, false, and true.
   Searched by binary search: keep sorted in strcmp() order.
*/
static const char *JavaScript_keywords[] = {
  "abstract", "await",      "boolean",   "break",        "byte",
  "case",     "catch",      "char",      "class",        "const",
  "continue", "debugger",   "default",   "delete",       "do",
  "double",   "else",       "enum",      "export",       "extends",
  "false",    "final",      "finally",   "float",        "for",
  "function", "goto",       "if",        "implements",   "import",
  "in",       "instanceof", "int",       "interface",    "let",
  "long",     "native",     "new",       "null",         "package",
  "private",  "protected",  "public",    "return",       "short",
  "static",   "super",      "switch",    "synchronized", "this",
  "throw",    "throws",     "transient", "true",         "try",
  "typeof",   "var",        "void",      "volatile",     "while",
  "with",     "yield"
};

/* Number of entries in the keyword table named `lang##_keywords'.
   `lang' must be the prefix of an actual array (not a pointer) visible at the
   point of use. Expands to a parenthesized constant expression of type size_t
   with no trailing semicolon, so it is usable inside larger expressions.
*/
#define num_keywords(lang) \
  (sizeof(lang##_keywords)/sizeof(lang##_keywords[0]))

/* Generator for the per-language keyword lookup functions.
   lang_is_keyword(X) defines

     static const char *X_is_keyword(const char *word);

   which binary-searches the table X_keywords (num_keywords(X) entries,
   sorted in strcmp() order) for `word'.
   `word' must be a NUL-terminated C string.
   Returns the matching table entry (i.e., the pointer stored in the table,
   not `word' itself) or 0 when `word' is not a keyword.
   The search interval is the half-open index range [lo, hi).
*/
#define lang_is_keyword(lang)                                           \
  static const char *lang##_is_keyword(const char *word)                \
  {                                                                     \
    int lo = 0, hi = num_keywords(lang);                                \
    while (lo < hi) {                                                   \
      const int mid = (lo + hi) / 2;                                    \
      const char *candidate = lang##_keywords[mid];                     \
      const int order = strcmp(word, candidate);                        \
      if (order == 0)                                                   \
        return candidate;                                               \
      if (order < 0)                                                    \
        hi = mid;                                                       \
      else                                                              \
        lo = mid + 1;                                                   \
    }                                                                   \
    return 0;                                                           \
  }

/* Define individual is_keyword functions per language.
   Each invocation expands to one complete static function definition, hence
   no trailing semicolon.
*/
/* C_is_keyword */
lang_is_keyword(C)
/* CPP_is_keyword */
lang_is_keyword(CPP)
/* Java_is_keyword */
lang_is_keyword(Java)
/* Python_is_keyword */
lang_is_keyword(Python)
/* JavaScript_is_keyword */
lang_is_keyword(JavaScript)

/* Keyword lookup currently in effect. Starts out as the C lookup and is
   re-pointed by set_or_detect_lang() once the input language is known.
*/
const char *(*is_keyword)(const char *) = C_is_keyword;

/* Conversion table from filename extension to language code.
   To find language code, consider all entries and check each ext
   against filename; matched language is langs[i].lang.
   Invariant: langs[X].lang == X for every Language value.
   String representation of language code is langs[X].name.
   For that invariant to hold, the primary entries must come first, in the
   order of the Language enumerators; the alternative extensions follow and
   carry an empty name.

   Have certain config settings depend on the language.
   Use 2 step:
   1. determine language from name/extension
   2. look up language config
*/
static const struct {
  const char *ext;   // filename extension including the leading dot
  Language lang;     // language code selected by this extension
  const char *name;  // display name; "" for alternative extensions
}
  langs[] = {
    { ".c",    C,          "C" },
    { ".cpp",  CPP,        "C++" },
    { ".java", JAVA,       "Java" },
    { ".js",   JAVASCRIPT, "JavaScript" },
    { ".py",   PYTHON,     "Python" },

    // Alternatives:
    { ".h",    C,          "" },
    { ".C",    CPP,        "" },
    { ".cc",   CPP,        "" },
    { ".hh",   CPP,        "" },
};

/* Returns the display name of `lang' (e.g. "C++").
   `lang' is used directly as an index into langs[] without a range check,
   so it must be a valid Language value; this relies on the table invariant
   langs[X].lang == X.
*/
const char *lang_name(Language lang)
{
  return langs[lang].name;
}

/* Per-language configuration, indexed by Language value (the language is
   implicit in the position). The rows are listed in the same order as the
   primary entries of langs[]; keep both tables in step with the Language
   enum (declared outside this file).
*/
static const struct {
  //Language lang; implicit
  const char *(*is_keyword)(const char *);  // keyword lookup for the language
}
  lang_configs[] = {
    { C_is_keyword,          },
    { CPP_is_keyword,        },
    { Java_is_keyword,       },
    { JavaScript_is_keyword, },
    { Python_is_keyword,     },
};

/* Must be called right after a file is opened as stdin.
   Will attempt to remove any UTF-8 unicode signature (byte-order mark, BOM)
   at the beginning of the file.
   Unicode: U+FEFF
   UTF-8: EF BB BF

   Only the UTF-8 signature is recognized and removed here. For reference,
   the signatures of the other encodings (none of which are handled):

   First bytes Encoding              Must remove?
   00 00 FE FF UTF-32 big endian     Yes
   FF FE 00 00 UTF-32 little endian  Yes
   FE FF       UTF-16 big endian     Yes
   FF FE       UTF-16 little endian  Yes
   00 00 00 xx UTF-32 big endian     No
   xx 00 00 00 UTF-32 little endian  No
   00 xx       UTF-16 big endian     No
   xx 00       UTF-16 little endian  No
   otherwise   UTF-8                 No

   When the input does not start with the complete signature, nothing is
   lost: the byte that broke the match is handed back to stdin with ungetc()
   so that get() reads it afresh (with newline normalization and counting),
   and the 0xEF / 0xBB bytes already identified are placed in the lookahead
   buffer in front of it. Only those two byte values ever enter the buffer
   here; in particular a leading CR or LF never does, so it cannot be
   mistaken by get() for a line-continuation marker.
   At most one ungetc() is pending at any time, which is what the C library
   guarantees to support.
   NOTE(review): bytes delivered via the lookahead buffer are not added to
   utf8_count by get(); that only concerns the rare non-BOM 0xEF start.
*/
static void remove_BOM(void)
{
  int c1 = getchar();
  if (c1 != 0xEF) {
    // Not the start of a BOM (maybe EOF; ungetc(EOF) is Okay).
    ungetc(c1, stdin);
    return;
  }

  int c2 = getchar();
  if (c2 == 0xBB) {
    int c3 = getchar();
    if (c3 == 0xBF)
      return; // complete signature EF BB BF consumed
    // EF BB xx: xx goes back to stdin, BB is buffered (delivered after EF).
    ungetc(c3, stdin);
    buffer[buffered++] = c2;
  }
  else
    // EF xx: xx goes back to stdin.
    ungetc(c2, stdin);

  // Pushed last, hence delivered first by get().
  buffer[buffered++] = c1;
}

/* Redirects stdin to read from `file' and records it as the current
   filename (the global is updated even when opening fails).
   Returns -1 when the file cannot be opened for reading (a warning is
   printed unless nowarn is set); otherwise returns the result of
   set_or_detect_lang(0), i.e., the language derived from the filename.
*/
int open_as_stdin(const char *file)
{
  filename = file;

  if (freopen(filename, "r", stdin))
    return set_or_detect_lang(0);

  // Could not (re)open; complain unless told to keep quiet.
  if (!nowarn)
    fprintf(stderr, "(W): Cannot read file %s.\n", filename);
  return -1;
}

/* Reads one byte from stdin, folding the DOS (\r \n) and classic Mac OS (\r)
   physical line endings into a plain \n:
   - CR LF: the CR is skipped, yet still counted in char_count and
     utf8_count, and the LF is returned;
   - CR followed by anything else: the follower is handed back to stdin and
     \n is returned in place of the CR;
   - every other value (EOF included) is returned as is.
   Never returns a CR (\r). Line and column counters are left alone; the
   caller accounts for the returned character itself.
*/
static int normalize_newline(void)
{
  /* No need to recognize Unicode code points here. */
  int ch = getchar();
  if (ch != '\r')
    return ch;

  // Seen a CR; is it the first half of a CR LF pair?
  int follower = getchar();
  if (follower != '\n') {
    // Lone CR. Put the follower back (ungetc(EOF) is Okay) and
    // report the CR as a newline.
    ungetc(follower, stdin);
    return '\n';
  }

  // CR LF: account for the swallowed CR (no use incrementing column).
  char_count++;
  utf8_count++;
  return follower; // the \n
}

/* Delivers the next input character, or EOF.
   Pushed-back characters in the lookahead buffer are served first (most
   recently pushed one first); otherwise a fresh byte is read via
   normalize_newline().
   An escaped newline (backslash directly followed by a line ending, i.e., a
   line continuation) is either skipped silently (logical_lines set) or
   signalled by returning the special '\r' character, which otherwise never
   occurs.
   Maintains the physical coordinates (linenr, column, saved_col) and the
   absolute counters (char_count, utf8_count) for every character delivered.
*/
int get(void)
{
  int ch;

  for (;;) {
    // Serve lookahead first; it never holds EOF.
    if (buffered > 0) {
      ch = buffer[--buffered];
      char_count++;
      if (ch != '\n' && ch != '\r') {
        column++;
        return ch;
      }
      // '\n', or '\r' (line continuation) which acts like '\n':
      linenr++;
      saved_col = column;
      column = 0;
      return ch;
    }

    // Nothing buffered: read a fresh char (never '\r').
    ch = normalize_newline();
    if (ch == EOF)
      return EOF;
    char_count++;
    if (utf8_start(ch))
      utf8_count++;

    // A normalized end-of-line (\r|\r?\n) signals a logical end-of-line.
    if (ch == '\n') {
      linenr++;
      saved_col = column;
      column = 0;
      return ch;
    }

    // Anything but a backslash is an ordinary character.
    if (ch != '\\')
      break;

    // Backslash: look ahead one char (never kept across get calls!).
    int ahead = normalize_newline();
    if (ahead != '\n') {
      // Just a regular backslash; ungetc(EOF) is Okay.
      ungetc(ahead, stdin);
      break;
    }

    // Escaped newline: account for the newline, move to next physical line.
    char_count++;
    utf8_count++;
    linenr++;
    saved_col = column+1; // +1 for backslash
    column = 0;

    if (!logical_lines)
      // Signal that this was an escaped newline (= line continuation):
      return '\r';
    // Else still need to deliver a character; go round again
    // (could again start a line continuation!).
  }

  column++;
  return ch;
}

/* Pushes `cc' back so that the next get() delivers it again, and rolls the
   position counters back accordingly.
   Pushing back EOF is a no-op.
   Both line-ending codes produced by get() may be pushed back: \n (logical
   line end) and \r (line continuation). For those the column is restored
   from the single saved_col slot, so pushing back more than one line ending
   in a row does not restore the column faithfully.
   A full lookahead buffer is fatal: prints a message and exits with code 2.
*/
void unget(int cc)
{
  if (cc == EOF)
    return;

  if (buffered >= MAX_BUF) {
    fprintf(stderr, "(F): Lookahead buffer overflow (MAX=%u).\n", MAX_BUF);
    exit(2);
  }

  if (cc != '\n' && cc != '\r')
    column--;
  else {
    // Back onto the previous line; column was 0 right after getting the
    // line ending, so reinstate the remembered one.
    linenr--;
    column = saved_col;
  }
  char_count--;
  buffer[buffered++] = cc;
}

/* Either set this file's input language explicitly via a string or
   use the filename extension to determine the language.
   If neither works out, use the default language C.
   Uses global filename (maybe stdin).
   Once the language is known, configs for that language are applied,
   e.g. the correct keyword table to use.
*/
Language set_or_detect_lang(const char *source)
{
  int i;
  Language lang = C; // default language

  if (source) {
    /* Check if explicit language is known: */
    for (i = 0; i < sizeof(langs)/sizeof(langs[0]); i++)
      if (!strcmp(source, langs[i].name)) {
        lang = langs[i].lang;
        goto done;
      }
    fprintf(stderr, "(E): No support for language `%s'.\n", source);
  }

  char *p;
  if (p = strrchr(filename, '.')) {
    for (i = 0; i < sizeof(langs)/sizeof(langs[0]); i++)
      if (!strcmp(p, langs[i].ext)) {
        lang = langs[i].lang;
        goto done;
      }
    fprintf(stderr, "(E): Unknown filename extension `%s'.\n", p);
  }
  if (!nowarn)
    fprintf(stderr, "(W): Assuming default language C.\n");

 done:
  is_keyword = lang_configs[lang].is_keyword;
  return lang;
}

// Dynamically sized token buffer; managed solely by the token_buf_*
// functions that follow.
static char *token_buf = 0;        // heap storage; 0 until first use
static unsigned token_alloc = 0;   // bytes currently allocated for token_buf
static unsigned token_len = 0;     // bytes of token text currently stored

// Guarantees that at least one more byte fits in the token buffer, i.e.,
// token_len < token_alloc on return.
// The first call allocates 64 KiB; after that the capacity is doubled each
// time the buffer is completely full. Running out of memory is fatal:
// a message is printed and the program exits with code 4.
static void token_buf_room(void)
{
  if (token_len != token_alloc)
    return; // still space left

  if (token_alloc == 0) {
    // Nothing allocated yet: first time allocation.
    token_alloc = 65536;
    token_buf = malloc(token_alloc);
    if (!token_buf) {
      fprintf(stderr, "(F): Allocation of token buffer failed.\n");
      exit(4);
    }
    token_buf[0] = '\0'; // for safety
    return;
  }

  // All space used up: grow to twice the size.
  token_alloc *= 2;
  token_buf = realloc(token_buf, token_alloc);
  if (!token_buf) {
    fprintf(stderr, "(F): Reallocation of token buffer failed.\n");
    exit(4);
  }
}

// Appends the character `cc' to the token text, growing the buffer first
// when it is full. The token length increases by one.
static void token_buf_push(int cc)
{
  token_buf_room();
  // Now token_len < token_alloc, so the slot at token_len is usable.
  token_buf[token_len] = cc;
  token_len += 1;
}

// Removes the last character from the token text and returns it.
// An empty token is left untouched and 0 is returned instead.
static int token_buf_pop(void)
{
  if (!token_len)
    return 0;
  token_len--;
  return token_buf[token_len];
}

// Adds a terminating NUL character which does not change the token length.
// After this call token_buf can be used as a C string holding the token.
static void token_buf_close(void)
{
  token_buf_room(); // make sure the slot for the NUL exists
  token_buf[token_len] = '\0'; // Note: no advance
}

// Resets the token buffer cursor: the token becomes empty.
// The allocated storage is kept for reuse and its old contents are not
// cleared.
static void token_buf_reset(void)
{
  token_len = 0;
}

/* Tokenization of C++ programming language source text.
   Recognizes:
   - identifier
   - reserved word/keyword
   - binary, octal, decimal, hexadecimal and floating-point numbers
   - double-quoted string literal
   - single-quoted character literal
   - all single, double, and triple operator and punctuation symbols
   - the preprocessor tokens # and ##
   Optionally:
   - filename       start_token
   - line_comment   comment_token
   - block_comment  comment_token
   - newline        newline_token
   - continuation   continuation_token
   - whitespace     whitespace_token

   Normally skips white-space and comments and flags anything
   left over as illegal characters.

   (Approximately 20 tests per single character worst-case.)

   Returns 0 upon EOF else the token length in bytes.
   (There are no 0-length tokens!)
   EOF may be interpreted as a token. The function then returns:
   token = "", type = endoffile, line and col correctly defined.

   An unexpected EOF in the middle of a token will cause an error message
   and the partial token to be output first before a next call returns 0
   (to indicate the EOF condition).
*/

unsigned C_tokenize_int(const char **token, enum TokenClass *type,
			unsigned *line, unsigned *col, unsigned *pos)
{
  int cc;
  *type = ENDOFFILE;

  do { // infinite loop; after token recognized breaks out.
    // Start collecting a token.
    token_buf_reset();
    *line = linenr;
    *col = column;
    *pos = char_count;
    // white-space tokens see continuation lines:
    logical_lines = 0;
    cc = get();

  restart:
    // cc already read; coordinates for it are correct.

    /*** WHITE-SPACE ***/

    /* In principle all consecutive white-space including \n and \r (and some
       other control chars) are collected and form a single whitespace token.
       However, when newlines are requested to be reported as separate tokens,
       they break this pattern. Note that we cannot issues multiple tokens
       in a single call to this function.

       Token buf will only hold some white-space chars when implicitly
       requested via whitespace_token; otherwise stays empty.
       Same for the \n and \r requests.
     */

    if (cc == '\n' && newline_token) { // end of a logical line
      // Here we assume the buf is empty.
      token_buf_push(cc);
      *type = NEWLINE;
      break;
    }

    if (cc == '\r' && continuation_token) { // end of a physical line
      // Here we assume the buf is empty.
      token_buf_push('\\');
      token_buf_push('\n');
      *type = CONTINUATION;
      break;
    }

    // Aggregate as much white-space as possible.
    // FIXME: officially a NUL should be considered white-space.
    while (isspace(cc)) {	// i.e., cc in [ \f\n\r\t\v]
      // Here: !newline_token (!continuation_token)
      if (whitespace_token)
        if (cc == '\r') { // line continuation
          // Convert back to original char sequence:
          token_buf_push('\\');
          token_buf_push('\n');
        }
        else
          token_buf_push(cc); // perhaps \n
      //else: white-space is discarded

      // Here: whitespace_token implies token_len > 0

      cc = get();
      if (cc == '\n' && newline_token ||
	  cc == '\r' && continuation_token) {
	// Must issue whitespace token if so requested.
	if (whitespace_token) {
	  // Undo lookahead (unget(EOF) has no effect!):
	  unget(cc); // next token will be newline/continuation
	  *type = WHITESPACE;
	  token_buf_close();
	  *token = token_buf;
	  return token_len;
	}
	// Issue newline/continuation token right away:
	goto restart;
      }
    }
    // Here: !isspace: must break or start real token.

    if (whitespace_token && token_len) {
      // Undo lookahead (unget(EOF) has no effect!):
      unget(cc);
      *type = WHITESPACE;
      break;
    }

    if (cc == EOF) {
      token_buf_reset();
      break;
    }

    // Rest of tokens treat line continuations as non-existent:
    logical_lines = 1;

    // If white-space skipped must reset coordinates:
    *line = linenr;
    *col = column-1;
    *pos = char_count-1;

    /*** OPTIONAL # LINE COMMENT (to ignore preprocessor statements) ***/
    // Java: no preprocessor directives.

    // NULs (like many other chars) in comments are silently ignored!

    if (cc == '#' && hash_as_comment) {
      if (comment_token)
        token_buf_push(cc);
      // Skip till end-of-line (\n exclusive):
      while ((cc = get()) != '\n' && cc != EOF)
        if (comment_token)
          token_buf_push(cc);
      // cc == '\n' || cc == EOF
      // Don't consider \n part of comment.
      if (comment_token) {
	// Undo lookahead (unget(EOF) has no effect!):
        unget(cc);
        *type = LINE_COMMENT;
        break;
      }
      *line = linenr-1;
      *col = saved_col;
      *pos = char_count;
      goto restart;
    }

    /*** LINE COMMENT AND BLOCK COMMENT (C/C++/Java) ***/

    if (cc == '/') {
      cc = get();
      if (cc == '/') {
        if (comment_token) {
          token_buf_push(cc);
          token_buf_push(cc);
        }
        // Skip till end-of-line (\n exclusive):
        while ((cc = get()) != '\n' && cc != EOF)
          if (comment_token)
            token_buf_push(cc);
        // cc == '\n' || cc == EOF
        // Don't consider \n part of comment.
        if (comment_token) {
	  // Undo lookahead (unget(EOF) has no effect!):
          unget(cc);
          *type = LINE_COMMENT;
          break;
        }
	*line = linenr-1;
	*col = saved_col;
	*pos = char_count;
        goto restart;
      }

      if (cc == '*') {
        if (comment_token) {
          token_buf_push('/');
          token_buf_push(cc);
        }
        // Skip till */ inclusive:
        int nc = get(); // if EOF next get will be EOF too
        if (comment_token && nc != EOF)
          token_buf_push(nc);
        do {
          cc = nc;
          nc = get();
          if (nc == EOF) { // Error!
            fprintf(stderr,
                    "(E): [%s:%u] Unexpected end-of-file in /* comment.\n",
                    filename, *line);
            unexpect_eof++;
	    if (comment_token)
	      // Better return partial comment as token and postpone EOF:
	      *type = BLOCK_COMMENT;
	    else
	      token_buf_reset();
	    token_buf_close();
	    *token = token_buf;
            return token_len;
          }
          if (comment_token)
            token_buf_push(nc);
        } while (cc != '*' || nc != '/');
        // cc == '*' && nc == '/'
        // Don't consider char right after */ as part of comment.
        if (comment_token) {
          *type = BLOCK_COMMENT;
          break;
        }
	*line = linenr;
	*col = column;
	*pos = char_count;
        cc = get();
        goto restart;
      }
      // seen / but not // or /*
      unget(cc); // char after /
      cc = '/'; // restore /
    }

    // If white-space and/or comments skipped must reset coordinates:
    *line = linenr;
    *col = column-1;
    *pos = char_count-1;

    /*** CHAR and STRING PREFIX (C/C++) ***/

    // Allow u,U,L prefix for string and char
    // FIXME: allow u8 as prefix for string
    if (cc == 'L' || cc == 'u' || cc == 'U') {
      token_buf_push(cc);
      cc = get();
      if (cc == '"')
        goto string_token;
      if (cc == '\'')
        goto char_token;
      // u,U,L will be interpreted as (start of) identifier.
      unget(cc); // char after u,U,L
      cc = token_buf_pop(); // restore original and remove from token
    }

    /*** IDENTIFIER (C/C++/Java) and KEYWORD (C/C++) ***/
    // Java: false, true, null are literals
    // FIXME: Flag to allow .letter as part of identifier?
    // (compound identifier)

    // Simplistic solution to allowing Unicode: allow any char >= 128 without
    // actual checking for UTF-8.
    if (isalpha(cc) || cc == '_' || cc == '$' || (cc & 0x80)) {
      token_buf_push(cc);
      while (isalnum(cc = get()) || cc == '_' || cc == '$' ||
             cc != EOF && (cc & 0x80))
        token_buf_push(cc);
      unget(cc);
      token_buf_close();
      *type = is_keyword(token_buf) ? KEYWORD : IDENTIFIER;
      break;
    }

    /*** INTEGER and FLOATING ***/
    // Java: uses _ in numbers as insignificant separator
    // Java: decimal suffix: [lL], float suffix: [fFdD]
    // Java: allows hex float

#if 0
    // Examples:
    int bin_num = 0B010101u;
    int oct_num = 01234567L;
    int hex_num = 0x123ABCLL;
    int dec_num = 12345678;

    float flt_num1 = 077.;
    float flt_num2 = 077.987;
    float flt_num3 = 77.;
    float flt_num4 = .77;
#endif

    // . digits ... floating
    if (cc == '.') {
      // Look ahead for a digit:
      int nc;
      if (isdigit(nc = get())) {
        unget(nc);
        goto start_fraction;
      }
      unget(nc);
      // Could go immediately to operator: goto seen_period
    }

    if (isdigit(cc)) { // binary, octal, decimal, or hexadecimal literal
      // Types of integer literals:
      enum {
        BIN, OCT, DEC, HEX
      } int_lit = cc == '0' ? OCT : DEC;

      // Lookahead:
      int nc = get();
      if (int_lit == OCT && (nc == 'x' || nc == 'X')) {
        int_lit = HEX;
        token_buf_push(cc); // the 0
        cc = nc; // the x or X
      }
      else
      if (int_lit == OCT && (nc == 'b' || nc == 'B')) {
        int_lit = BIN;
        token_buf_push(cc); // the 0
        cc = nc; // the b or B
      }
      else
        unget(nc); // isdigit(cc)

      do {
        token_buf_push(cc);
        cc = get();

        // Allow for ' between `digits':
        if (cc == '\'') {
          // Keep the ' in the token for now:
          token_buf_push(cc);
          int nc = get();
          if (isdigit(nc) || int_lit == HEX && isxdigit(nc))
            cc = nc;
          else { // Error!
            fprintf(stderr,
                    "(E): [%s:%u] C++14 only allows ' between digits.\n",
                    filename, linenr);
            // what to do?
          }
        }
      } while (isdigit(cc) || int_lit == HEX && isxdigit(cc));
      // !is[x]digit(cc)

      // FIXME: allow hex floats in C
      if (int_lit == OCT || int_lit == DEC) {
        int floating = 0;
        // Seen digits-sequence. Maybe followed by . or e or E?
        if (cc == '.') { // fractional part
        start_fraction:
          floating = 1;
          token_buf_push(cc);
          // digits? FIXME: again allow ' between digits
          while (isdigit(cc = get()))
            token_buf_push(cc);
          // !isdigit(cc)
        }
        // cc != '.' || !isdigit(cc)
        if (cc == 'e' || cc == 'E') { // exponent
          floating = 1;
          token_buf_push(cc);
          if ((cc = get()) == '-' || cc == '+') {
            token_buf_push(cc);
            cc = get();
          }
          // FIXME: no check for at least 1 digit
          // FIXME: again allow ' between digits
          while (isdigit(cc)) {
            token_buf_push(cc);
            cc = get();
          }
          // !isdigit(cc)
        }
        if (floating) {
          if (cc == 'f' || cc == 'F' || cc == 'l' || cc == 'L')
            token_buf_push(cc);
          else
            unget(cc);
          *type = FLOATING;
          break;
        }
      }

      // optional integer suffix: l, ll, lu, llu, u, ul, ull, any case
      if (cc == 'l' || cc == 'L') {
        token_buf_push(cc);
        // maybe another l
        cc = get();
        if (cc == 'l' || cc == 'L') {
          token_buf_push(cc);
          // Here: token is digits[lL][lL]
          cc = get();
        }
        // maybe a u
        if (cc == 'u' || cc == 'U')
          // Here: token is digits[lL][lL]?[u|U]
          token_buf_push(cc);
        else
          unget(cc);
      }
      else if (cc == 'u' || cc == 'U') {
        token_buf_push(cc);
        // maybe an l
        cc = get();
        if (cc == 'l' || cc == 'L') {
          token_buf_push(cc);
          // Here: token is digits[uU][lL]
          cc = get();
        }
        // maybe another l
        if (cc == 'l' || cc == 'L')
          // Here: token is digits[uU][lL]?[lL]
          token_buf_push(cc);
        else
          unget(cc);
      }
      else
        unget(cc);
      *type = INTEGER;
      break;
    }

    /*** STRING (C/C++/Java) ***/

    if (cc == '"') {
    string_token:
      token_buf_push(cc);
      // Watch out for escaped " inside string.
      cc = get();
      while (cc != '"') {
        if (cc == EOF) { // Error!
          fprintf(stderr,
                  "(E): [%s:%u] Unexpected end-of-file in string literal.\n",
                  filename, *line);
          unexpect_eof++;
	  // Better return partial string as token and postpone EOF:
	  *type = STRING;
	  token_buf_close();
	  *token = token_buf;
	  return token_len;
        }
        token_buf_push(cc);
        int nc = get();

        if (cc == '\\') {
          // FIXME: No check on valid escape char!
          // ' " ? \ a b f n r t v
          token_buf_push(nc);
          cc = get();
        }
        else
          cc = nc;
      }
      // cc == '"'
      token_buf_push(cc);
      *type = STRING;
      break;
    }

    /*** CHARACTER (C/C++/Java) ***/

    if (cc == '\'') {
    char_token:
      token_buf_push(cc);
      // Watch out for escaped ' inside char.
      cc = get();
      // Cannot have empty char!
      if (cc == '\'') {
	fprintf(stderr,
		"(E): [%s:%u] Cannot have an empty character literal.\n",
		filename, linenr);
	// Output as token anyway, but count as illegal:
	token_buf_push(cc);
	*type = CHARACTER;
	illegals++;
	break;
      }

      // FIXME: Avoid including too many chars.
      while (cc != '\'') {
        if (cc == EOF) { // Error!
          fprintf(stderr,
                  "(E): [%s:%u] Unexpected end-of-file in character literal.\n",
                  filename, linenr);
          unexpect_eof++;
	  // Better return partial character as token and postpone EOF:
	  *type = CHARACTER;
	  token_buf_close();
	  *token = token_buf;
	  return token_len;
        }
        if (cc == '\n') { // Error!
          fprintf(stderr,
                 "(E): [%s:%u] Cannot have end-of-line in character literal.\n",
                  filename, linenr);
	  illegals++;
	  // Immediately terminate character literal as if ' present.
	  // cc = '\''; make into valid literal??? No!
	  break;