gas/app.c

   1 /* This is the Assembler Pre-Processor
   2    Copyright (C) 1987-2020 Free Software Foundation, Inc.
   3
   4    This file is part of GAS, the GNU Assembler.
   5
   6    GAS is free software; you can redistribute it and/or modify
   7    it under the terms of the GNU General Public License as published by
   8    the Free Software Foundation; either version 3, or (at your option)
   9    any later version.
  10
  11    GAS is distributed in the hope that it will be useful, but WITHOUT
  12    ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
  13    or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
  14    License for more details.
  15
  16    You should have received a copy of the GNU General Public License
  17    along with GAS; see the file COPYING.  If not, write to the Free
  18    Software Foundation, 51 Franklin Street - Fifth Floor, Boston, MA
  19    02110-1301, USA.  */
  20
  21 /* Modified by Allen Wirfs-Brock, Instantiations Inc 2/90.  */
  22 /* App, the assembler pre-processor.  This pre-processor strips out
  23    excess spaces, turns single-quoted characters into a decimal
  24    constant, and turns the # in # <number> <filename> <garbage> into a
  25    .linefile.  This needs better error-handling.  */
  26
  27 #include "as.h"
  28
/* If the compiler does not announce ISO C conformance (__STDC__ is not
   1), make `const' expand to nothing, unless it is already a macro.  */
#if (__STDC__ != 1)
#ifndef const
#define const  /* empty */
#endif
#endif

#ifdef H_TICK_HEX
/* When nonzero, do_scrub_begin classifies 'h' and 'H' as LEX_IS_H.
   Not declared static, so it can be set from outside this file.  */
int enable_h_tick_hex = 0;
#endif

#ifdef TC_M68K
/* Whether we are scrubbing in m68k MRI mode.  This is different from
   flag_m68k_mri, because the two flags will be affected by the .mri
   pseudo-op at different times.  */
static int scrub_m68k_mri;

/* The pseudo-op which switches in and out of MRI mode.  See the
   comment in do_scrub_chars.  The recognizer there lets the final '0'
   match a '1' as well, so this one string covers both ".mri 0" and
   ".mri 1".  */
static const char mri_pseudo[] = ".mri 0";
#else
/* Without TC_M68K there is no MRI mode; a constant 0 lets every test
   of scrub_m68k_mri in this file compile unchanged.  */
#define scrub_m68k_mri 0
#endif

#if defined TC_ARM && defined OBJ_ELF
/* The pseudo-op for which we need to special-case `@' characters.
   See the comment in do_scrub_chars.  */
static const char   symver_pseudo[] = ".symver";
/* NULL while no match is in progress; otherwise points at the next
   character of symver_pseudo that do_scrub_chars expects to see.  */
static const char * symver_state;
#endif
#ifdef TC_ARM
/* Part of the scrubber state that app_push saves and app_pop
   restores.  */
static char last_char;
#endif

/* Character class table, indexed by character value (0..255).  It is
   filled in by do_scrub_begin with the LEX_IS_* codes; an entry of 0
   means the character has no special class.  */
static char lex[256];
/* The characters that do_scrub_begin marks as LEX_IS_SYMBOL_COMPONENT
   by default (it also marks every value from 128 to 255).  */
static const char symbol_chars[] =
"$._ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789";
  65
/* Character class codes stored in lex[].  Zero is reserved for "no
   special class" (do_scrub_begin tests lex['/'] == 0), and the value 7
   is not used.  Several codes only exist for particular targets.  */
#define LEX_IS_SYMBOL_COMPONENT         1
#define LEX_IS_WHITESPACE               2
#define LEX_IS_LINE_SEPARATOR           3
#define LEX_IS_COMMENT_START            4
#define LEX_IS_LINE_COMMENT_START       5
#define LEX_IS_TWOCHAR_COMMENT_1ST      6
#define LEX_IS_STRINGQUOTE              8
#define LEX_IS_COLON                    9
#define LEX_IS_NEWLINE                  10
#define LEX_IS_ONECHAR_QUOTE            11
#ifdef TC_V850
/* Assigned to '-' by do_scrub_begin on the V850.  */
#define LEX_IS_DOUBLEDASH_1ST           12
#endif
#ifdef TC_M32R
/* The M32R turns on the `||' handling guarded by DOUBLEBAR_PARALLEL.  */
#define DOUBLEBAR_PARALLEL
#endif
#ifdef DOUBLEBAR_PARALLEL
/* Assigned to '|' by do_scrub_begin.  */
#define LEX_IS_DOUBLEBAR_1ST            13
#endif
#define LEX_IS_PARALLEL_SEPARATOR       14
#ifdef H_TICK_HEX
/* Assigned to 'h' and 'H' by do_scrub_begin when enable_h_tick_hex is
   nonzero.  */
#define LEX_IS_H                        15
#endif
/* Class predicates.  C is used directly as an index into lex[], so it
   must lie in the range 0..255; callers in this file test for EOF
   before applying these to a freshly read character.  */
#define IS_SYMBOL_COMPONENT(c)          (lex[c] == LEX_IS_SYMBOL_COMPONENT)
#define IS_WHITESPACE(c)                (lex[c] == LEX_IS_WHITESPACE)
#define IS_LINE_SEPARATOR(c)            (lex[c] == LEX_IS_LINE_SEPARATOR)
#define IS_PARALLEL_SEPARATOR(c)        (lex[c] == LEX_IS_PARALLEL_SEPARATOR)
#define IS_COMMENT(c)                   (lex[c] == LEX_IS_COMMENT_START)
#define IS_LINE_COMMENT(c)              (lex[c] == LEX_IS_LINE_COMMENT_START)
#define IS_NEWLINE(c)                   (lex[c] == LEX_IS_NEWLINE)

/* Forward declaration; the definition appears later in this file.  */
static int process_escape (int);
  98
  99 /* FIXME-soon: The entire lexer/parser thingy should be
 100    built statically at compile time rather than dynamically
 101    each and every time the assembler is run.  xoxorich.  */
 102
 103 void
 104 do_scrub_begin (int m68k_mri ATTRIBUTE_UNUSED)
 105 {
 106   const char *p;
 107   int c;
 108
 109   lex[' '] = LEX_IS_WHITESPACE;
 110   lex['\t'] = LEX_IS_WHITESPACE;
 111   lex['\r'] = LEX_IS_WHITESPACE;
 112   lex['\n'] = LEX_IS_NEWLINE;
 113   lex[':'] = LEX_IS_COLON;
 114
 115 #ifdef TC_M68K
 116   scrub_m68k_mri = m68k_mri;
 117
 118   if (! m68k_mri)
 119 #endif
 120     {
 121       lex['"'] = LEX_IS_STRINGQUOTE;
 122
 123 #if ! defined (TC_HPPA)
 124       lex['\''] = LEX_IS_ONECHAR_QUOTE;
 125 #endif
 126
 127 #ifdef SINGLE_QUOTE_STRINGS
 128       lex['\''] = LEX_IS_STRINGQUOTE;
 129 #endif
 130     }
 131
 132   /* Note: if any other character can be LEX_IS_STRINGQUOTE, the loop
 133      in state 5 of do_scrub_chars must be changed.  */
 134
 135   /* Note that these override the previous defaults, e.g. if ';' is a
 136      comment char, then it isn't a line separator.  */
 137   for (p = symbol_chars; *p; ++p)
 138     lex[(unsigned char) *p] = LEX_IS_SYMBOL_COMPONENT;
 139
 140   for (c = 128; c < 256; ++c)
 141     lex[c] = LEX_IS_SYMBOL_COMPONENT;
 142
 143 #ifdef tc_symbol_chars
 144   /* This macro permits the processor to specify all characters which
 145      may appears in an operand.  This will prevent the scrubber from
 146      discarding meaningful whitespace in certain cases.  The i386
 147      backend uses this to support prefixes, which can confuse the
 148      scrubber as to whether it is parsing operands or opcodes.  */
 149   for (p = tc_symbol_chars; *p; ++p)
 150     lex[(unsigned char) *p] = LEX_IS_SYMBOL_COMPONENT;
 151 #endif
 152
 153   /* The m68k backend wants to be able to change comment_chars.  */
 154 #ifndef tc_comment_chars
 155 #define tc_comment_chars comment_chars
 156 #endif
 157   for (p = tc_comment_chars; *p; p++)
 158     lex[(unsigned char) *p] = LEX_IS_COMMENT_START;
 159
 160   for (p = line_comment_chars; *p; p++)
 161     lex[(unsigned char) *p] = LEX_IS_LINE_COMMENT_START;
 162
 163 #ifndef tc_line_separator_chars
 164 #define tc_line_separator_chars line_separator_chars
 165 #endif
 166   for (p = tc_line_separator_chars; *p; p++)
 167     lex[(unsigned char) *p] = LEX_IS_LINE_SEPARATOR;
 168
 169 #ifdef tc_parallel_separator_chars
 170   /* This macro permits the processor to specify all characters which
 171      separate parallel insns on the same line.  */
 172   for (p = tc_parallel_separator_chars; *p; p++)
 173     lex[(unsigned char) *p] = LEX_IS_PARALLEL_SEPARATOR;
 174 #endif
 175
 176   /* Only allow slash-star comments if slash is not in use.
 177      FIXME: This isn't right.  We should always permit them.  */
 178   if (lex['/'] == 0)
 179     lex['/'] = LEX_IS_TWOCHAR_COMMENT_1ST;
 180
 181 #ifdef TC_M68K
 182   if (m68k_mri)
 183     {
 184       lex['\''] = LEX_IS_STRINGQUOTE;
 185       lex[';'] = LEX_IS_COMMENT_START;
 186       lex['*'] = LEX_IS_LINE_COMMENT_START;
 187       /* The MRI documentation says '!' is LEX_IS_COMMENT_START, but
 188          then it can't be used in an expression.  */
 189       lex['!'] = LEX_IS_LINE_COMMENT_START;
 190     }
 191 #endif
 192
 193 #ifdef TC_V850
 194   lex['-'] = LEX_IS_DOUBLEDASH_1ST;
 195 #endif
 196 #ifdef DOUBLEBAR_PARALLEL
 197   lex['|'] = LEX_IS_DOUBLEBAR_1ST;
 198 #endif
 199 #ifdef TC_D30V
 200   /* Must do this is we want VLIW instruction with "->" or "<-".  */
 201   lex['-'] = LEX_IS_SYMBOL_COMPONENT;
 202 #endif
 203
 204 #ifdef H_TICK_HEX
 205   if (enable_h_tick_hex)
 206     {
 207       lex['h'] = LEX_IS_H;
 208       lex['H'] = LEX_IS_H;
 209     }
 210 #endif
 211 }
 212
/* Saved state of the scrubber.  These variables persist between calls
   to do_scrub_chars; most of them are saved by app_push and restored
   by app_pop.  */

/* Current state of the do_scrub_chars state machine.  */
static int state;
/* State to return to once the current string, comment or canned
   output string (states 5, -2 and -1) has been dealt with.  */
static int old_state;
/* Next character to be emitted while in state -1.  */
static const char *out_string;
/* Scratch buffer saved and restored together with the rest of the
   state.  NOTE(review): presumably the storage out_string points into;
   confirm against the rest of do_scrub_chars.  */
static char out_buf[20];
/* Incremented each time state 6 rewrites a backslash-newline inside a
   string as the two characters `\' and `n'.  */
static int add_newlines;
/* Input carried over from the previous call of do_scrub_chars, or NULL
   if there is none; when set, scanning resumes here before any new
   input is requested.  */
static char *saved_input;
/* Number of bytes available at saved_input.  */
static size_t saved_input_len;
/* Buffer that the GET callback is asked to fill with raw input.  */
static char input_buffer[32 * 1024];
/* Progress of the ".mri" recognizer in do_scrub_chars: NULL when no
   match is in progress, otherwise the next expected character of
   mri_pseudo.  Only advanced when TC_M68K is defined.  */
static const char *mri_state;
/* Last character accepted by that recognizer; '1' on completion means
   MRI mode is being switched on.  */
static char mri_last_ch;
 224
/* Data structure for saving the state of app across #include's.  Note that
   app is called asynchronously to the parsing of the .include's, so our
   state at the time .include is interpreted is completely unrelated.
   That's why we have to save it all.

   Each member mirrors the file-scope variable of the same name.  The
   structure is allocated by app_push and released by app_pop.  */

struct app_save
{
  int          state;
  int          old_state;
  const char * out_string;
  /* A full copy of the out_buf array, not a pointer to it.  */
  char         out_buf[sizeof (out_buf)];
  int          add_newlines;
  /* Heap copy of the pending input, owned by this structure, or NULL
     when there was no pending input.  */
  char *       saved_input;
  /* Length of saved_input; only set when saved_input is not NULL.  */
  size_t       saved_input_len;
#ifdef TC_M68K
  int          scrub_m68k_mri;
#endif
  const char * mri_state;
  char         mri_last_ch;
#if defined TC_ARM && defined OBJ_ELF
  const char * symver_state;
#endif
#ifdef TC_ARM
  char last_char;
#endif
};
 251
 252 char *
 253 app_push (void)
 254 {
 255   struct app_save *saved;
 256
 257   saved = XNEW (struct app_save);
 258   saved->state = state;
 259   saved->old_state = old_state;
 260   saved->out_string = out_string;
 261   memcpy (saved->out_buf, out_buf, sizeof (out_buf));
 262   saved->add_newlines = add_newlines;
 263   if (saved_input == NULL)
 264     saved->saved_input = NULL;
 265   else
 266     {
 267       saved->saved_input = XNEWVEC (char, saved_input_len);
 268       memcpy (saved->saved_input, saved_input, saved_input_len);
 269       saved->saved_input_len = saved_input_len;
 270     }
 271 #ifdef TC_M68K
 272   saved->scrub_m68k_mri = scrub_m68k_mri;
 273 #endif
 274   saved->mri_state = mri_state;
 275   saved->mri_last_ch = mri_last_ch;
 276 #if defined TC_ARM && defined OBJ_ELF
 277   saved->symver_state = symver_state;
 278 #endif
 279 #ifdef TC_ARM
 280   saved->last_char = last_char;
 281 #endif
 282
 283   /* do_scrub_begin() is not useful, just wastes time.  */
 284
 285   state = 0;
 286   saved_input = NULL;
 287   add_newlines = 0;
 288
 289   return (char *) saved;
 290 }
 291
 292 void
 293 app_pop (char *arg)
 294 {
 295   struct app_save *saved = (struct app_save *) arg;
 296
 297   /* There is no do_scrub_end ().  */
 298   state = saved->state;
 299   old_state = saved->old_state;
 300   out_string = saved->out_string;
 301   memcpy (out_buf, saved->out_buf, sizeof (out_buf));
 302   add_newlines = saved->add_newlines;
 303   if (saved->saved_input == NULL)
 304     saved_input = NULL;
 305   else
 306     {
 307       gas_assert (saved->saved_input_len <= sizeof (input_buffer));
 308       memcpy (input_buffer, saved->saved_input, saved->saved_input_len);
 309       saved_input = input_buffer;
 310       saved_input_len = saved->saved_input_len;
 311       free (saved->saved_input);
 312     }
 313 #ifdef TC_M68K
 314   scrub_m68k_mri = saved->scrub_m68k_mri;
 315 #endif
 316   mri_state = saved->mri_state;
 317   mri_last_ch = saved->mri_last_ch;
 318 #if defined TC_ARM && defined OBJ_ELF
 319   symver_state = saved->symver_state;
 320 #endif
 321 #ifdef TC_ARM
 322   last_char = saved->last_char;
 323 #endif
 324
 325   free (arg);
 326 }
 327
 328 /* @@ This assumes that \n &c are the same on host and target.  This is not
 329    necessarily true.  */
 330
 331 static int
 332 process_escape (int ch)
 333 {
 334   switch (ch)
 335     {
 336     case 'b':
 337       return '\b';
 338     case 'f':
 339       return '\f';
 340     case 'n':
 341       return '\n';
 342     case 'r':
 343       return '\r';
 344     case 't':
 345       return '\t';
 346     case '\'':
 347       return '\'';
 348     case '"':
 349       return '\"';
 350     default:
 351       return ch;
 352     }
 353 }
 354
 355 /* This function is called to process input characters.  The GET
 356    parameter is used to retrieve more input characters.  GET should
 357    set its parameter to point to a buffer, and return the length of
 358    the buffer; it should return 0 at end of file.  The scrubbed output
 359    characters are put into the buffer starting at TOSTART; the TOSTART
 360    buffer is TOLEN bytes in length.  The function returns the number
 361    of scrubbed characters put into TOSTART.  This will be TOLEN unless
 362    end of file was seen.  This function is arranged as a state
 363    machine, and saves its state so that it may return at any point.
 364    This is the way the old code used to work.  */
 365
 366 size_t
 367 do_scrub_chars (size_t (*get) (char *, size_t), char *tostart, size_t tolen)
 368 {
 369   char *to = tostart;
 370   char *toend = tostart + tolen;
 371   char *from;
 372   char *fromend;
 373   size_t fromlen;
 374   int ch, ch2 = 0;
 375   /* Character that started the string we're working on.  */
 376   static char quotechar;
 377
 378   /*State 0: beginning of normal line
 379           1: After first whitespace on line (flush more white)
 380           2: After first non-white (opcode) on line (keep 1white)
 381           3: after second white on line (into operands) (flush white)
 382           4: after putting out a .linefile, put out digits
 383           5: parsing a string, then go to old-state
 384           6: putting out \ escape in a "d string.
 385           7: no longer used
 386           8: no longer used
 387           9: After seeing symbol char in state 3 (keep 1white after symchar)
 388          10: After seeing whitespace in state 9 (keep white before symchar)
 389          11: After seeing a symbol character in state 0 (eg a label definition)
 390          -1: output string in out_string and go to the state in old_state
 391          -2: flush text until a '*' '/' is seen, then go to state old_state
 392 #ifdef TC_V850
 393          12: After seeing a dash, looking for a second dash as a start
 394              of comment.
 395 #endif
 396 #ifdef DOUBLEBAR_PARALLEL
 397          13: After seeing a vertical bar, looking for a second
 398              vertical bar as a parallel expression separator.
 399 #endif
 400 #ifdef TC_PREDICATE_START_CHAR
 401          14: After seeing a predicate start character at state 0, looking
 402              for a predicate end character as predicate.
 403          15: After seeing a predicate start character at state 1, looking
 404              for a predicate end character as predicate.
 405 #endif
 406 #ifdef TC_Z80
 407          16: After seeing an 'a' or an 'A' at the start of a symbol
 408          17: After seeing an 'f' or an 'F' in state 16
 409 #endif
 410           */
 411
 412   /* I added states 9 and 10 because the MIPS ECOFF assembler uses
 413      constructs like ``.loc 1 20''.  This was turning into ``.loc
 414      120''.  States 9 and 10 ensure that a space is never dropped in
 415      between characters which could appear in an identifier.  Ian
 416      Taylor, ian@cygnus.com.
 417
 418      I added state 11 so that something like "Lfoo add %r25,%r26,%r27" works
 419      correctly on the PA (and any other target where colons are optional).
 420      Jeff Law, law@cs.utah.edu.
 421
 422      I added state 13 so that something like "cmp r1, r2 || trap #1" does not
 423      get squashed into "cmp r1,r2||trap#1", with the all important space
 424      between the 'trap' and the '#1' being eliminated.  nickc@cygnus.com  */
 425
 426   /* This macro gets the next input character.  */
 427
 428 #define GET()                                                   \
 429   (from < fromend                                               \
 430    ? * (unsigned char *) (from++)                               \
 431    : (saved_input = NULL,                                       \
 432       fromlen = (*get) (input_buffer, sizeof input_buffer),     \
 433       from = input_buffer,                                      \
 434       fromend = from + fromlen,                                 \
 435       (fromlen == 0                                             \
 436        ? EOF                                                    \
 437        : * (unsigned char *) (from++))))
 438
 439   /* This macro pushes a character back on the input stream.  */
 440
 441 #define UNGET(uch) (*--from = (uch))
 442
 443   /* This macro puts a character into the output buffer.  If this
 444      character fills the output buffer, this macro jumps to the label
 445      TOFULL.  We use this rather ugly approach because we need to
 446      handle two different termination conditions: EOF on the input
 447      stream, and a full output buffer.  It would be simpler if we
 448      always read in the entire input stream before processing it, but
 449      I don't want to make such a significant change to the assembler's
 450      memory usage.  */
 451
 452 #define PUT(pch)                                \
 453   do                                            \
 454     {                                           \
 455       *to++ = (pch);                            \
 456       if (to >= toend)                          \
 457         goto tofull;                            \
 458     }                                           \
 459   while (0)
 460
 461   if (saved_input != NULL)
 462     {
 463       from = saved_input;
 464       fromend = from + saved_input_len;
 465     }
 466   else
 467     {
 468       fromlen = (*get) (input_buffer, sizeof input_buffer);
 469       if (fromlen == 0)
 470         return 0;
 471       from = input_buffer;
 472       fromend = from + fromlen;
 473     }
 474
 475   while (1)
 476     {
 477       /* The cases in this switch end with continue, in order to
 478          branch back to the top of this while loop and generate the
 479          next output character in the appropriate state.  */
 480       switch (state)
 481         {
 482         case -1:
 483           ch = *out_string++;
 484           if (*out_string == '\0')
 485             {
 486               state = old_state;
 487               old_state = 3;
 488             }
 489           PUT (ch);
 490           continue;
 491
 492         case -2:
 493           for (;;)
 494             {
 495               do
 496                 {
 497                   ch = GET ();
 498
 499                   if (ch == EOF)
 500                     {
 501                       as_warn (_("end of file in comment"));
 502                       goto fromeof;
 503                     }
 504
 505                   if (ch == '\n')
 506                     PUT ('\n');
 507                 }
 508               while (ch != '*');
 509
 510               while ((ch = GET ()) == '*')
 511                 ;
 512
 513               if (ch == EOF)
 514                 {
 515                   as_warn (_("end of file in comment"));
 516                   goto fromeof;
 517                 }
 518
 519               if (ch == '/')
 520                 break;
 521
 522               UNGET (ch);
 523             }
 524
 525           state = old_state;
 526           UNGET (' ');
 527           continue;
 528
 529         case 4:
 530           ch = GET ();
 531           if (ch == EOF)
 532             goto fromeof;
 533           else if (ch >= '0' && ch <= '9')
 534             PUT (ch);
 535           else
 536             {
 537               while (ch != EOF && IS_WHITESPACE (ch))
 538                 ch = GET ();
 539               if (ch == '"')
 540                 {
 541                   quotechar = ch;
 542                   state = 5;
 543                   old_state = 3;
 544                   PUT (ch);
 545                 }
 546               else
 547                 {
 548                   while (ch != EOF && ch != '\n')
 549                     ch = GET ();
 550                   state = 0;
 551                   PUT (ch);
 552                 }
 553             }
 554           continue;
 555
 556         case 5:
 557           /* We are going to copy everything up to a quote character,
 558              with special handling for a backslash.  We try to
 559              optimize the copying in the simple case without using the
 560              GET and PUT macros.  */
 561           {
 562             char *s;
 563             ptrdiff_t len;
 564
 565             for (s = from; s < fromend; s++)
 566               {
 567                 ch = *s;
 568                 if (ch == '\\'
 569                     || ch == quotechar
 570                     || ch == '\n')
 571                   break;
 572               }
 573             len = s - from;
 574             if (len > toend - to)
 575               len = toend - to;
 576             if (len > 0)
 577               {
 578                 memcpy (to, from, len);
 579                 to += len;
 580                 from += len;
 581                 if (to >= toend)
 582                   goto tofull;
 583               }
 584           }
 585
 586           ch = GET ();
 587           if (ch == EOF)
 588             {
 589               /* This buffer is here specifically so
 590                  that the UNGET below will work.  */
 591               static char one_char_buf[1];
 592
 593               as_warn (_("end of file in string; '%c' inserted"), quotechar);
 594               state = old_state;
 595               from = fromend = one_char_buf + 1;
 596               fromlen = 1;
 597               UNGET ('\n');
 598               PUT (quotechar);
 599             }
 600           else if (ch == quotechar)
 601             {
 602               state = old_state;
 603               PUT (ch);
 604             }
 605           else if (TC_STRING_ESCAPES && ch == '\\')
 606             {
 607               state = 6;
 608               PUT (ch);
 609             }
 610           else if (scrub_m68k_mri && ch == '\n')
 611             {
 612               /* Just quietly terminate the string.  This permits lines like
 613                    bne  label   loop if we haven't reach end yet.  */
 614               state = old_state;
 615               UNGET (ch);
 616               PUT ('\'');
 617             }
 618           else
 619             {
 620               PUT (ch);
 621             }
 622           continue;
 623
 624         case 6:
 625           state = 5;
 626           ch = GET ();
 627           switch (ch)
 628             {
 629               /* Handle strings broken across lines, by turning '\n' into
 630                  '\\' and 'n'.  */
 631             case '\n':
 632               UNGET ('n');
 633               add_newlines++;
 634               PUT ('\\');
 635               continue;
 636
 637             case EOF:
 638               as_warn (_("end of file in string; '%c' inserted"), quotechar);
 639               PUT (quotechar);
 640               continue;
 641
 642             case '"':
 643             case '\\':
 644             case 'b':
 645             case 'f':
 646             case 'n':
 647             case 'r':
 648             case 't':
 649             case 'v':
 650             case 'x':
 651             case 'X':
 652             case '0':
 653             case '1':
 654             case '2':
 655             case '3':
 656             case '4':
 657             case '5':
 658             case '6':
 659             case '7':
 660               break;
 661
 662             default:
 663 #ifdef ONLY_STANDARD_ESCAPES
 664               as_warn (_("unknown escape '\\%c' in string; ignored"), ch);
 665 #endif
 666               break;
 667             }
 668           PUT (ch);
 669           continue;
 670
 671 #ifdef DOUBLEBAR_PARALLEL
 672         case 13:
 673           ch = GET ();
 674           if (ch != '|')
 675             abort ();
 676
 677           /* Reset back to state 1 and pretend that we are parsing a
 678              line from just after the first white space.  */
 679           state = 1;
 680           PUT ('|');
 681 #ifdef TC_TIC6X
 682           /* "||^" is used for SPMASKed instructions.  */
 683           ch = GET ();
 684           if (ch == EOF)
 685             goto fromeof;
 686           else if (ch == '^')
 687             PUT ('^');
 688           else
 689             UNGET (ch);
 690 #endif
 691           continue;
 692 #endif
 693 #ifdef TC_Z80
 694         case 16:
 695           /* We have seen an 'a' at the start of a symbol, look for an 'f'.  */
 696           ch = GET ();
 697           if (ch == 'f' || ch == 'F')
 698             {
 699               state = 17;
 700               PUT (ch);
 701             }
 702           else
 703             {
 704               state = 9;
 705               break;
 706             }
 707           /* Fall through.  */
 708         case 17:
 709           /* We have seen "af" at the start of a symbol,
 710              a ' here is a part of that symbol.  */
 711           ch = GET ();
 712           state = 9;
 713           if (ch == '\'')
 714             /* Change to avoid warning about unclosed string.  */
 715             PUT ('`');
 716           else if (ch != EOF)
 717             UNGET (ch);
 718           break;
 719 #endif
 720         }
 721
 722       /* OK, we are somewhere in states 0 through 4 or 9 through 11.  */
 723
 724       /* flushchar: */
 725       ch = GET ();
 726
 727 #ifdef TC_PREDICATE_START_CHAR
 728       if (ch == TC_PREDICATE_START_CHAR && (state == 0 || state == 1))
 729         {
 730           state += 14;
 731           PUT (ch);
 732           continue;
 733         }
 734       else if (state == 14 || state == 15)
 735         {
 736           if (ch == TC_PREDICATE_END_CHAR)
 737             {
 738               state -= 14;
 739               PUT (ch);
 740               ch = GET ();
 741             }
 742           else
 743             {
 744               PUT (ch);
 745               continue;
 746             }
 747         }
 748 #endif
 749
 750     recycle:
 751
 752 #if defined TC_ARM && defined OBJ_ELF
 753       /* We need to watch out for .symver directives.  See the comment later
 754          in this function.  */
 755       if (symver_state == NULL)
 756         {
 757           if ((state == 0 || state == 1) && ch == symver_pseudo[0])
 758             symver_state = symver_pseudo + 1;
 759         }
 760       else
 761         {
 762           /* We advance to the next state if we find the right
 763              character.  */
 764           if (ch != '\0' && (*symver_state == ch))
 765             ++symver_state;
 766           else if (*symver_state != '\0')
 767             /* We did not get the expected character, or we didn't
 768                get a valid terminating character after seeing the
 769                entire pseudo-op, so we must go back to the beginning.  */
 770             symver_state = NULL;
 771           else
 772             {
 773               /* We've read the entire pseudo-op.  If this is the end
 774                  of the line, go back to the beginning.  */
 775               if (IS_NEWLINE (ch))
 776                 symver_state = NULL;
 777             }
 778         }
 779 #endif /* TC_ARM && OBJ_ELF */
 780
 781 #ifdef TC_M68K
 782       /* We want to have pseudo-ops which control whether we are in
 783          MRI mode or not.  Unfortunately, since m68k MRI mode affects
 784          the scrubber, that means that we need a special purpose
 785          recognizer here.  */
 786       if (mri_state == NULL)
 787         {
 788           if ((state == 0 || state == 1)
 789               && ch == mri_pseudo[0])
 790             mri_state = mri_pseudo + 1;
 791         }
 792       else
 793         {
 794           /* We advance to the next state if we find the right
 795              character, or if we need a space character and we get any
 796              whitespace character, or if we need a '0' and we get a
 797              '1' (this is so that we only need one state to handle
 798              ``.mri 0'' and ``.mri 1'').  */
 799           if (ch != '\0'
 800               && (*mri_state == ch
 801                   || (*mri_state == ' '
 802                       && lex[ch] == LEX_IS_WHITESPACE)
 803                   || (*mri_state == '0'
 804                       && ch == '1')))
 805             {
 806               mri_last_ch = ch;
 807               ++mri_state;
 808             }
 809           else if (*mri_state != '\0'
 810                    || (lex[ch] != LEX_IS_WHITESPACE
 811                        && lex[ch] != LEX_IS_NEWLINE))
 812             {
 813               /* We did not get the expected character, or we didn't
 814                  get a valid terminating character after seeing the
 815                  entire pseudo-op, so we must go back to the
 816                  beginning.  */
 817               mri_state = NULL;
 818             }
 819           else
 820             {
 821               /* We've read the entire pseudo-op.  mri_last_ch is
 822                  either '0' or '1' indicating whether to enter or
 823                  leave MRI mode.  */
 824               do_scrub_begin (mri_last_ch == '1');
 825               mri_state = NULL;
 826
 827               /* We continue handling the character as usual.  The
 828                  main gas reader must also handle the .mri pseudo-op
 829                  to control expression parsing and the like.  */
 830             }
 831         }
 832 #endif
 833
 834       if (ch == EOF)
 835         {
 836           if (state != 0)
 837             {
 838               as_warn (_("end of file not at end of a line; newline inserted"));
 839               state = 0;
 840               PUT ('\n');
 841             }
 842           goto fromeof;
 843         }
 844
 845       switch (lex[ch])
 846         {
 847         case LEX_IS_WHITESPACE:
 848           do
 849             {
 850               ch = GET ();
 851             }
 852           while (ch != EOF && IS_WHITESPACE (ch));
 853           if (ch == EOF)
 854             goto fromeof;
 855
 856           if (state == 0)
 857             {
 858               /* Preserve a single whitespace character at the
 859                  beginning of a line.  */
 860               state = 1;
 861               UNGET (ch);
 862               PUT (' ');
 863               break;
 864             }
 865
 866 #ifdef KEEP_WHITE_AROUND_COLON
 867           if (lex[ch] == LEX_IS_COLON)
 868             {
 869               /* Only keep this white if there's no white *after* the
 870                  colon.  */
 871               ch2 = GET ();
 872               if (ch2 != EOF)
 873                 UNGET (ch2);
 874               if (!IS_WHITESPACE (ch2))
 875                 {
 876                   state = 9;
 877                   UNGET (ch);
 878                   PUT (' ');
 879                   break;
 880                 }
 881             }
 882 #endif
 883           if (IS_COMMENT (ch)
 884               || ch == '/'
 885               || IS_LINE_SEPARATOR (ch)
 886               || IS_PARALLEL_SEPARATOR (ch))
 887             {
 888               if (scrub_m68k_mri)
 889                 {
 890                   /* In MRI mode, we keep these spaces.  */
 891                   UNGET (ch);
 892                   PUT (' ');
 893                   break;
 894                 }
 895               goto recycle;
 896             }
 897
 898           /* If we're in state 2 or 11, we've seen a non-white
 899              character followed by whitespace.  If the next character
 900              is ':', this is whitespace after a label name which we
 901              normally must ignore.  In MRI mode, though, spaces are
 902              not permitted between the label and the colon.  */
 903           if ((state == 2 || state == 11)
 904               && lex[ch] == LEX_IS_COLON
 905               && ! scrub_m68k_mri)
 906             {
 907               state = 1;
 908               PUT (ch);
 909               break;
 910             }
 911
 912           switch (state)
 913             {
 914             case 1:
 915               /* We can arrive here if we leave a leading whitespace
 916                  character at the beginning of a line.  */
 917               goto recycle;
 918             case 2:
 919               state = 3;
 920               if (to + 1 < toend)
 921                 {
 922                   /* Optimize common case by skipping UNGET/GET.  */
 923                   PUT (' ');    /* Sp after opco */
 924                   goto recycle;
 925                 }
 926               UNGET (ch);
 927               PUT (' ');
 928               break;
 929             case 3:
 930 #ifndef TC_KEEP_OPERAND_SPACES
 931               /* For TI C6X, we keep these spaces as they may separate
 932                  functional unit specifiers from operands.  */
 933               if (scrub_m68k_mri)
 934 #endif
 935                 {
 936                   /* In MRI mode, we keep these spaces.  */
 937                   UNGET (ch);
 938                   PUT (' ');
 939                   break;
 940                 }
 941               goto recycle;     /* Sp in operands */
 942             case 9:
 943             case 10:
 944 #ifndef TC_KEEP_OPERAND_SPACES
 945               if (scrub_m68k_mri)
 946 #endif
 947                 {
 948                   /* In MRI mode, we keep these spaces.  */
 949                   state = 3;
 950                   UNGET (ch);
 951                   PUT (' ');
 952                   break;
 953                 }
 954               state = 10;       /* Sp after symbol char */
 955               goto recycle;
 956             case 11:
 957               if (LABELS_WITHOUT_COLONS || flag_m68k_mri)
 958                 state = 1;
 959               else
 960                 {
 961                   /* We know that ch is not ':', since we tested that
 962                      case above.  Therefore this is not a label, so it
 963                      must be the opcode, and we've just seen the
 964                      whitespace after it.  */
 965                   state = 3;
 966                 }
 967               UNGET (ch);
 968               PUT (' ');        /* Sp after label definition.  */
 969               break;
 970             default:
 971               BAD_CASE (state);
 972             }
 973           break;
 974
 975         case LEX_IS_TWOCHAR_COMMENT_1ST:
 976           ch2 = GET ();
 977           if (ch2 == '*')
 978             {
 979               for (;;)
 980                 {
 981                   do
 982                     {
 983                       ch2 = GET ();
 984                       if (ch2 != EOF && IS_NEWLINE (ch2))
 985                         add_newlines++;
 986                     }
 987                   while (ch2 != EOF && ch2 != '*');
 988
 989                   while (ch2 == '*')
 990                     ch2 = GET ();
 991
 992                   if (ch2 == EOF || ch2 == '/')
 993                     break;
 994
 995                   /* This UNGET will ensure that we count newlines
 996                      correctly.  */
 997                   UNGET (ch2);
 998                 }
 999
1000               if (ch2 == EOF)
1001                 as_warn (_("end of file in multiline comment"));
1002
1003               ch = ' ';
1004               goto recycle;
1005             }
1006 #ifdef DOUBLESLASH_LINE_COMMENTS
1007           else if (ch2 == '/')
1008             {
1009               do
1010                 {
1011                   ch = GET ();
1012                 }
1013               while (ch != EOF && !IS_NEWLINE (ch));
1014               if (ch == EOF)
1015                 as_warn ("end of file in comment; newline inserted");
1016               state = 0;
1017               PUT ('\n');
1018               break;
1019             }
1020 #endif
1021           else
1022             {
1023               if (ch2 != EOF)
1024                 UNGET (ch2);
1025               if (state == 9 || state == 10)
1026                 state = 3;
1027               PUT (ch);
1028             }
1029           break;
1030
1031         case LEX_IS_STRINGQUOTE:
1032           quotechar = ch;
1033           if (state == 10)
1034             {
1035               /* Preserve the whitespace in foo "bar".  */
1036               UNGET (ch);
1037               state = 3;
1038               PUT (' ');
1039
1040               /* PUT didn't jump out.  We could just break, but we
1041                  know what will happen, so optimize a bit.  */
1042               ch = GET ();
1043               old_state = 3;
1044             }
1045           else if (state == 9)
1046             old_state = 3;
1047           else
1048             old_state = state;
1049           state = 5;
1050           PUT (ch);
1051           break;
1052
1053         case LEX_IS_ONECHAR_QUOTE:
1054 #ifdef H_TICK_HEX
1055           if (state == 9 && enable_h_tick_hex)
1056             {
1057               char c;
1058
1059               c = GET ();
1060               as_warn ("'%c found after symbol", c);
1061               UNGET (c);
1062             }
1063 #endif
1064           if (state == 10)
1065             {
1066               /* Preserve the whitespace in foo 'b'.  */
1067               UNGET (ch);
1068               state = 3;
1069               PUT (' ');
1070               break;
1071             }
1072           ch = GET ();
1073           if (ch == EOF)
1074             {
1075               as_warn (_("end of file after a one-character quote; \\0 inserted"));
1076               ch = 0;
1077             }
1078           if (ch == '\\')
1079             {
1080               ch = GET ();
1081               if (ch == EOF)
1082                 {
1083                   as_warn (_("end of file in escape character"));
1084                   ch = '\\';
1085                 }
1086               else
1087                 ch = process_escape (ch);
1088             }
1089           sprintf (out_buf, "%d", (int) (unsigned char) ch);
1090
1091           /* None of these 'x constants for us.  We want 'x'.  */
1092           if ((ch = GET ()) != '\'')
1093             {
1094 #ifdef REQUIRE_CHAR_CLOSE_QUOTE
1095               as_warn (_("missing close quote; (assumed)"));
1096 #else
1097               if (ch != EOF)
1098                 UNGET (ch);
1099 #endif
1100             }
1101           if (strlen (out_buf) == 1)
1102             {
1103               PUT (out_buf[0]);
1104               break;
1105             }
1106           if (state == 9)
1107             old_state = 3;
1108           else
1109             old_state = state;
1110           state = -1;
1111           out_string = out_buf;
1112           PUT (*out_string++);
1113           break;
1114
1115         case LEX_IS_COLON:
1116 #ifdef KEEP_WHITE_AROUND_COLON
1117           state = 9;
1118 #else
1119           if (state == 9 || state == 10)
1120             state = 3;
1121           else if (state != 3)
1122             state = 1;
1123 #endif
1124           PUT (ch);
1125           break;
1126
1127         case LEX_IS_NEWLINE:
1128           /* Roll out a bunch of newlines from inside comments, etc.  */
1129           if (add_newlines)
1130             {
1131               --add_newlines;
1132               UNGET (ch);
1133             }
1134           /* Fall through.  */
1135
1136         case LEX_IS_LINE_SEPARATOR:
1137           state = 0;
1138           PUT (ch);
1139           break;
1140
1141         case LEX_IS_PARALLEL_SEPARATOR:
1142           state = 1;
1143           PUT (ch);
1144           break;
1145
1146 #ifdef TC_V850
1147         case LEX_IS_DOUBLEDASH_1ST:
1148           ch2 = GET ();
1149           if (ch2 != '-')
1150             {
1151               if (ch2 != EOF)
1152                 UNGET (ch2);
1153               goto de_fault;
1154             }
1155           /* Read and skip to end of line.  */
1156           do
1157             {
1158               ch = GET ();
1159             }
1160           while (ch != EOF && ch != '\n');
1161
1162           if (ch == EOF)
1163             as_warn (_("end of file in comment; newline inserted"));
1164
1165           state = 0;
1166           PUT ('\n');
1167           break;
1168 #endif
1169 #ifdef DOUBLEBAR_PARALLEL
1170         case LEX_IS_DOUBLEBAR_1ST:
1171           ch2 = GET ();
1172           if (ch2 != EOF)
1173             UNGET (ch2);
1174           if (ch2 != '|')
1175             goto de_fault;
1176
1177           /* Handle '||' in two states as invoking PUT twice might
1178              result in the first one jumping out of this loop.  We'd
1179              then lose track of the state and one '|' char.  */
1180           state = 13;
1181           PUT ('|');
1182           break;
1183 #endif
1184         case LEX_IS_LINE_COMMENT_START:
1185           /* FIXME-someday: The two character comment stuff was badly
1186              thought out.  On i386, we want '/' as line comment start
1187              AND we want C style comments.  hence this hack.  The
1188              whole lexical process should be reworked.  xoxorich.  */
1189           if (ch == '/')
1190             {
1191               ch2 = GET ();
1192               if (ch2 == '*')
1193                 {
1194                   old_state = 3;
1195                   state = -2;
1196                   break;
1197                 }
1198               else if (ch2 != EOF)
1199                 {
1200                   UNGET (ch2);
1201                 }
1202             }
1203
1204           if (state == 0 || state == 1) /* Only comment at start of line.  */
1205             {
1206               int startch;
1207
1208               startch = ch;
1209
1210               do
1211                 {
1212                   ch = GET ();
1213                 }
1214               while (ch != EOF && IS_WHITESPACE (ch));
1215
1216               if (ch == EOF)
1217                 {
1218                   as_warn (_("end of file in comment; newline inserted"));
1219                   PUT ('\n');
1220                   break;
1221                 }
1222
1223               if (ch < '0' || ch > '9' || state != 0 || startch != '#')
1224                 {
1225                   /* Not a cpp line.  */
1226                   while (ch != EOF && !IS_NEWLINE (ch))
1227                     ch = GET ();
1228                   if (ch == EOF)
1229                     {
1230                       as_warn (_("end of file in comment; newline inserted"));
1231                       PUT ('\n');
1232                     }
1233                   else /* IS_NEWLINE (ch) */
1234                     {
1235                       /* To process non-zero add_newlines.  */
1236                       UNGET (ch);
1237                     }
1238                   state = 0;
1239                   break;
1240                 }
1241               /* Looks like `# 123 "filename"' from cpp.  */
1242               UNGET (ch);
1243               old_state = 4;
1244               state = -1;
1245               if (scrub_m68k_mri)
1246                 out_string = "\tlinefile ";
1247               else
1248                 out_string = "\t.linefile ";
1249               PUT (*out_string++);
1250               break;
1251             }
1252
1253 #ifdef TC_D10V
1254           /* All insns end in a char for which LEX_IS_SYMBOL_COMPONENT is true.
1255              Trap is the only short insn that has a first operand that is
1256              neither register nor label.
1257              We must prevent exef0f ||trap #1 from degenerating to exef0f ||trap#1.
1258              We can't make '#' LEX_IS_SYMBOL_COMPONENT because it is
1259              already LEX_IS_LINE_COMMENT_START.  However, it is the
1260              only character in line_comment_chars for d10v, hence we
1261              can recognize it as such.  */
1262           /* An alternative approach would be to reset the state to 1 when
1263              we see '||', '<-' or '->', but that seems to be overkill.  */
1264           if (state == 10)
1265             PUT (' ');
1266 #endif
1267           /* We have a line comment character which is not at the
1268              start of a line.  If this is also a normal comment
1269              character, fall through.  Otherwise treat it as a default
1270              character.  */
1271           if (strchr (tc_comment_chars, ch) == NULL
1272               && (! scrub_m68k_mri
1273                   || (ch != '!' && ch != '*')))
1274             goto de_fault;
1275           if (scrub_m68k_mri
1276               && (ch == '!' || ch == '*' || ch == '#')
1277               && state != 1
1278               && state != 10)
1279             goto de_fault;
1280           /* Fall through.  */
1281         case LEX_IS_COMMENT_START:
1282 #if defined TC_ARM && defined OBJ_ELF
1283           /* On the ARM, `@' is the comment character.
1284              Unfortunately this is also a special character in ELF .symver
1285              directives (and .type, though we deal with those another way).
1286              So we check if this line is such a directive, and treat
1287              the character as default if so.  This is a hack.  */
1288           if ((symver_state != NULL) && (*symver_state == 0))
1289             goto de_fault;
1290 #endif
1291
1292 #ifdef TC_ARM
1293           /* For the ARM, care is needed not to damage occurrences of \@
1294              by stripping the @ onwards.  Yuck.  */
1295           if ((to > tostart ? to[-1] : last_char) == '\\')
1296             /* Do not treat the @ as a start-of-comment.  */
1297             goto de_fault;
1298 #endif
1299
1300 #ifdef WARN_COMMENTS
1301           if (!found_comment)
1302             found_comment_file = as_where (&found_comment);
1303 #endif
1304           do
1305             {
1306               ch = GET ();
1307             }
1308           while (ch != EOF && !IS_NEWLINE (ch));
1309           if (ch == EOF)
1310             as_warn (_("end of file in comment; newline inserted"));
1311           state = 0;
1312           PUT ('\n');
1313           break;
1314
1315 #ifdef H_TICK_HEX
1316         case LEX_IS_H:
1317           /* Look for strings like H'[0-9A-Fa-f] and if found, replace
1318              the H' with 0x to make them gas-style hex characters.  */
1319           if (enable_h_tick_hex)
1320             {
1321               char quot;
1322
1323               quot = GET ();
1324               if (quot == '\'')
1325                 {
1326                   UNGET ('x');
1327                   ch = '0';
1328                 }
1329               else
1330                 UNGET (quot);
1331             }
1332 #endif
1333           /* Fall through.  */
1334
1335         case LEX_IS_SYMBOL_COMPONENT:
1336           if (state == 10)
1337             {
1338               /* This is a symbol character following another symbol
1339                  character, with whitespace in between.  We skipped
1340                  the whitespace earlier, so output it now.  */
1341               UNGET (ch);
1342               state = 3;
1343               PUT (' ');
1344               break;
1345             }
1346
1347 #ifdef TC_Z80
1348           /* "af'" is a symbol containing '\''.  */
1349           if (state == 3 && (ch == 'a' || ch == 'A'))
1350             {
1351               state = 16;
1352               PUT (ch);
1353               ch = GET ();
1354               if (ch == 'f' || ch == 'F')
1355                 {
1356                   state = 17;
1357                   PUT (ch);
1358                   break;
1359                 }
1360               else
1361                 {
1362                   state = 9;
1363                   if (ch == EOF || !IS_SYMBOL_COMPONENT (ch))
1364                     {
1365                       if (ch != EOF)
1366                         UNGET (ch);
1367                       break;
1368                     }
1369                 }
1370             }
1371 #endif
1372           if (state == 3)
1373             state = 9;
1374
1375           /* This is a common case.  Quickly copy CH and all the
1376              following symbol component or normal characters.  */
1377           if (to + 1 < toend
1378               && mri_state == NULL
1379 #if defined TC_ARM && defined OBJ_ELF
1380               && symver_state == NULL
1381 #endif
1382               )
1383             {
1384               char *s;
1385               ptrdiff_t len;
1386
1387               for (s = from; s < fromend; s++)
1388                 {
1389                   int type;
1390
1391                   ch2 = *(unsigned char *) s;
1392                   type = lex[ch2];
1393                   if (type != 0
1394                       && type != LEX_IS_SYMBOL_COMPONENT)
1395                     break;
1396                 }
1397
1398               if (s > from)
1399                 /* Handle the last character normally, for
1400                    simplicity.  */
1401                 --s;
1402
1403               len = s - from;
1404
1405               if (len > (toend - to) - 1)
1406                 len = (toend - to) - 1;
1407
1408               if (len > 0)
1409                 {
1410                   PUT (ch);
1411                   memcpy (to, from, len);
1412                   to += len;
1413                   from += len;
1414                   if (to >= toend)
1415                     goto tofull;
1416                   ch = GET ();
1417                 }
1418             }
1419
1420           /* Fall through.  */
1421         default:
1422         de_fault:
1423           /* Some relatively `normal' character.  */
1424           if (state == 0)
1425             {
1426               state = 11;       /* Now seeing label definition.  */
1427             }
1428           else if (state == 1)
1429             {
1430               state = 2;        /* Ditto.  */
1431             }
1432           else if (state == 9)
1433             {
1434               if (!IS_SYMBOL_COMPONENT (ch))
1435                 state = 3;
1436             }
1437           else if (state == 10)
1438             {
1439               if (ch == '\\')
1440                 {
1441                   /* Special handling for backslash: a backslash may
1442                      be the beginning of a formal parameter (of a
1443                      macro) following another symbol character, with
1444                      whitespace in between.  If that is the case, we
1445                      output a space before the parameter.  Strictly
1446                      speaking, correct handling depends upon what the
1447                      macro parameter expands into; if the parameter
1448                      expands into something which does not start with
1449                      an operand character, then we don't want to keep
1450                      the space.  We don't have enough information to
1451                      make the right choice, so here we are making the
1452                      choice which is more likely to be correct.  */
1453                   if (to + 1 >= toend)
1454                     {
1455                       /* If we're near the end of the buffer, save the
1456                          character for the next time round.  Otherwise
1457                          we'll lose our state.  */
1458                       UNGET (ch);
1459                       goto tofull;
1460                     }
1461                   *to++ = ' ';
1462                 }
1463
1464               state = 3;
1465             }
1466           PUT (ch);
1467           break;
1468         }
1469     }
1470
1471   /*NOTREACHED*/
1472
1473  fromeof:
1474   /* We have reached the end of the input.  */
1475 #ifdef TC_ARM
1476   if (to > tostart)
1477     last_char = to[-1];
1478 #endif
1479   return to - tostart;
1480
1481  tofull:
1482   /* The output buffer is full.  Save any input we have not yet
1483      processed.  */
1484   if (fromend > from)
1485     {
1486       saved_input = from;
1487       saved_input_len = fromend - from;
1488     }
1489   else
1490     saved_input = NULL;
1491
1492 #ifdef TC_ARM
1493   if (to > tostart)
1494     last_char = to[-1];
1495 #endif
1496   return to - tostart;
1497 }