gas/app.c

   1 /* This is the Assembler Pre-Processor
   2    Copyright 1987, 1990, 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998,
   3    1999, 2000, 2001, 2002, 2003, 2005, 2006, 2007, 2008, 2009, 2010, 2012
   4    Free Software Foundation, Inc.
   5
   6    This file is part of GAS, the GNU Assembler.
   7
   8    GAS is free software; you can redistribute it and/or modify
   9    it under the terms of the GNU General Public License as published by
  10    the Free Software Foundation; either version 3, or (at your option)
  11    any later version.
  12
  13    GAS is distributed in the hope that it will be useful, but WITHOUT
  14    ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
  15    or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
  16    License for more details.
  17
  18    You should have received a copy of the GNU General Public License
  19    along with GAS; see the file COPYING.  If not, write to the Free
  20    Software Foundation, 51 Franklin Street - Fifth Floor, Boston, MA
  21    02110-1301, USA.  */
  22
  23 /* Modified by Allen Wirfs-Brock, Instantiations Inc 2/90.  */
  24 /* App, the assembler pre-processor.  This pre-processor strips out
  25    excess spaces, turns single-quoted characters into a decimal
  26    constant, and turns the # in # <number> <filename> <garbage> into a
  27    .linefile.  This needs better error-handling.  */
  28
  29 #include "as.h"
  30
   31 #if (__STDC__ != 1)
      /* The compiler does not claim strict ISO C conformance: make `const'
         expand to nothing, unless something has already defined it as a
         macro.  */
   32 #ifndef const
   33 #define const  /* empty */
   34 #endif
   35 #endif
   36
   37 #ifdef H_TICK_HEX
      /* Nonzero makes do_scrub_begin classify 'h' and 'H' as LEX_IS_H.  */
   38 int enable_h_tick_hex = 0;
   39 #endif
   40
   41 #ifdef TC_M68K
   42 /* Whether we are scrubbing in m68k MRI mode.  This is different from
   43    flag_m68k_mri, because the two flags will be affected by the .mri
   44    pseudo-op at different times.  */
   45 static int scrub_m68k_mri;
   46
   47 /* The pseudo-op which switches in and out of MRI mode.  See the
   48    comment in do_scrub_chars.  */
   49 static const char mri_pseudo[] = ".mri 0";
   50 #else
      /* Without TC_M68K the flag is the constant 0, so code can test
         scrub_m68k_mri without wrapping the test in #ifdef.  */
   51 #define scrub_m68k_mri 0
   52 #endif
   53
   54 #if defined TC_ARM && defined OBJ_ELF
   55 /* The pseudo-op for which we need to special-case `@' characters.
   56    See the comment in do_scrub_chars.  */
   57 static const char   symver_pseudo[] = ".symver";
      /* Points at the next character of symver_pseudo still to be matched
         (at its terminating NUL once the whole name has been seen); NULL
         when no match is in progress.  */
   58 static const char * symver_state;
   59 #endif
  60
   61 static char lex[256];
      /* lex[] holds one LEX_IS_* class per byte value and is filled in by
         do_scrub_begin; an entry that is never assigned keeps the value 0
         it gets from static initialization.  symbol_chars lists the
         characters that are symbol components by default.  */
   62 static const char symbol_chars[] =
   63 "$._ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789";
   64
      /* Character classes stored in lex[].  The value 7 is not assigned to
         any class.  LEX_IS_DOUBLEDASH_1ST, LEX_IS_DOUBLEBAR_1ST and LEX_IS_H
         exist only when TC_V850, DOUBLEBAR_PARALLEL (implied by TC_M32R) and
         H_TICK_HEX respectively are defined.  */
   65 #define LEX_IS_SYMBOL_COMPONENT         1
   66 #define LEX_IS_WHITESPACE               2
   67 #define LEX_IS_LINE_SEPARATOR           3
   68 #define LEX_IS_COMMENT_START            4
   69 #define LEX_IS_LINE_COMMENT_START       5
   70 #define LEX_IS_TWOCHAR_COMMENT_1ST      6
   71 #define LEX_IS_STRINGQUOTE              8
   72 #define LEX_IS_COLON                    9
   73 #define LEX_IS_NEWLINE                  10
   74 #define LEX_IS_ONECHAR_QUOTE            11
   75 #ifdef TC_V850
   76 #define LEX_IS_DOUBLEDASH_1ST           12
   77 #endif
   78 #ifdef TC_M32R
   79 #define DOUBLEBAR_PARALLEL
   80 #endif
   81 #ifdef DOUBLEBAR_PARALLEL
   82 #define LEX_IS_DOUBLEBAR_1ST            13
   83 #endif
   84 #define LEX_IS_PARALLEL_SEPARATOR       14
   85 #ifdef H_TICK_HEX
   86 #define LEX_IS_H                        15
   87 #endif
      /* Class tests.  The argument is used directly as an index into lex[],
         which has 256 entries, so it must be in the range 0..255; in
         particular it must not be EOF.  */
   88 #define IS_SYMBOL_COMPONENT(c)          (lex[c] == LEX_IS_SYMBOL_COMPONENT)
   89 #define IS_WHITESPACE(c)                (lex[c] == LEX_IS_WHITESPACE)
   90 #define IS_LINE_SEPARATOR(c)            (lex[c] == LEX_IS_LINE_SEPARATOR)
   91 #define IS_PARALLEL_SEPARATOR(c)        (lex[c] == LEX_IS_PARALLEL_SEPARATOR)
   92 #define IS_COMMENT(c)                   (lex[c] == LEX_IS_COMMENT_START)
   93 #define IS_LINE_COMMENT(c)              (lex[c] == LEX_IS_LINE_COMMENT_START)
   94 #define IS_NEWLINE(c)                   (lex[c] == LEX_IS_NEWLINE)
   95
   96 static int process_escape (int);
  97
  98 /* FIXME-soon: The entire lexer/parser thingy should be
  99    built statically at compile time rather than dynamically
 100    each and every time the assembler is run.  xoxorich.  */
 101
 102 void
 103 do_scrub_begin (int m68k_mri ATTRIBUTE_UNUSED)
 104 {
 105   const char *p;
 106   int c;
 107
 108   lex[' '] = LEX_IS_WHITESPACE;
 109   lex['\t'] = LEX_IS_WHITESPACE;
 110   lex['\r'] = LEX_IS_WHITESPACE;
 111   lex['\n'] = LEX_IS_NEWLINE;
 112   lex[':'] = LEX_IS_COLON;
 113
 114 #ifdef TC_M68K
 115   scrub_m68k_mri = m68k_mri;
 116
 117   if (! m68k_mri)
 118 #endif
 119     {
 120       lex['"'] = LEX_IS_STRINGQUOTE;
 121
 122 #if ! defined (TC_HPPA) && ! defined (TC_I370)
 123       /* I370 uses single-quotes to delimit integer, float constants.  */
 124       lex['\''] = LEX_IS_ONECHAR_QUOTE;
 125 #endif
 126
 127 #ifdef SINGLE_QUOTE_STRINGS
 128       lex['\''] = LEX_IS_STRINGQUOTE;
 129 #endif
 130     }
 131
 132   /* Note: if any other character can be LEX_IS_STRINGQUOTE, the loop
 133      in state 5 of do_scrub_chars must be changed.  */
 134
 135   /* Note that these override the previous defaults, e.g. if ';' is a
 136      comment char, then it isn't a line separator.  */
 137   for (p = symbol_chars; *p; ++p)
 138     lex[(unsigned char) *p] = LEX_IS_SYMBOL_COMPONENT;
 139
 140   for (c = 128; c < 256; ++c)
 141     lex[c] = LEX_IS_SYMBOL_COMPONENT;
 142
 143 #ifdef tc_symbol_chars
 144   /* This macro permits the processor to specify all characters which
 145      may appears in an operand.  This will prevent the scrubber from
 146      discarding meaningful whitespace in certain cases.  The i386
 147      backend uses this to support prefixes, which can confuse the
 148      scrubber as to whether it is parsing operands or opcodes.  */
 149   for (p = tc_symbol_chars; *p; ++p)
 150     lex[(unsigned char) *p] = LEX_IS_SYMBOL_COMPONENT;
 151 #endif
 152
 153   /* The m68k backend wants to be able to change comment_chars.  */
 154 #ifndef tc_comment_chars
 155 #define tc_comment_chars comment_chars
 156 #endif
 157   for (p = tc_comment_chars; *p; p++)
 158     lex[(unsigned char) *p] = LEX_IS_COMMENT_START;
 159
 160   for (p = line_comment_chars; *p; p++)
 161     lex[(unsigned char) *p] = LEX_IS_LINE_COMMENT_START;
 162
 163   for (p = line_separator_chars; *p; p++)
 164     lex[(unsigned char) *p] = LEX_IS_LINE_SEPARATOR;
 165
 166 #ifdef tc_parallel_separator_chars
 167   /* This macro permits the processor to specify all characters which
 168      separate parallel insns on the same line.  */
 169   for (p = tc_parallel_separator_chars; *p; p++)
 170     lex[(unsigned char) *p] = LEX_IS_PARALLEL_SEPARATOR;
 171 #endif
 172
 173   /* Only allow slash-star comments if slash is not in use.
 174      FIXME: This isn't right.  We should always permit them.  */
 175   if (lex['/'] == 0)
 176     lex['/'] = LEX_IS_TWOCHAR_COMMENT_1ST;
 177
 178 #ifdef TC_M68K
 179   if (m68k_mri)
 180     {
 181       lex['\''] = LEX_IS_STRINGQUOTE;
 182       lex[';'] = LEX_IS_COMMENT_START;
 183       lex['*'] = LEX_IS_LINE_COMMENT_START;
 184       /* The MRI documentation says '!' is LEX_IS_COMMENT_START, but
 185          then it can't be used in an expression.  */
 186       lex['!'] = LEX_IS_LINE_COMMENT_START;
 187     }
 188 #endif
 189
 190 #ifdef TC_V850
 191   lex['-'] = LEX_IS_DOUBLEDASH_1ST;
 192 #endif
 193 #ifdef DOUBLEBAR_PARALLEL
 194   lex['|'] = LEX_IS_DOUBLEBAR_1ST;
 195 #endif
 196 #ifdef TC_D30V
 197   /* Must do this is we want VLIW instruction with "->" or "<-".  */
 198   lex['-'] = LEX_IS_SYMBOL_COMPONENT;
 199 #endif
 200
 201 #ifdef H_TICK_HEX
 202   if (enable_h_tick_hex)
 203     {
 204       lex['h'] = LEX_IS_H;
 205       lex['H'] = LEX_IS_H;
 206     }
 207 #endif
 208 }
 209
 210 /* Saved state of the scrubber.  */
 211 static int state;
 212 static int old_state;
 213 static char *out_string;
 214 static char out_buf[20];
 215 static int add_newlines;
 216 static char *saved_input;
 217 static size_t saved_input_len;
 218 static char input_buffer[32 * 1024];
 219 static const char *mri_state;
 220 static char mri_last_ch;
 221
 222 /* Data structure for saving the state of app across #include's.  Note that
 223    app is called asynchronously to the parsing of the .include's, so our
 224    state at the time .include is interpreted is completely unrelated.
 225    That's why we have to save it all.  */
 226
 227 struct app_save
 228 {
 229   int          state;
 230   int          old_state;
 231   char *       out_string;
 232   char         out_buf[sizeof (out_buf)];
 233   int          add_newlines;
 234   char *       saved_input;
 235   size_t       saved_input_len;
 236 #ifdef TC_M68K
 237   int          scrub_m68k_mri;
 238 #endif
 239   const char * mri_state;
 240   char         mri_last_ch;
 241 #if defined TC_ARM && defined OBJ_ELF
 242   const char * symver_state;
 243 #endif
 244 };
 245
 246 char *
 247 app_push (void)
 248 {
 249   register struct app_save *saved;
 250
 251   saved = (struct app_save *) xmalloc (sizeof (*saved));
 252   saved->state = state;
 253   saved->old_state = old_state;
 254   saved->out_string = out_string;
 255   memcpy (saved->out_buf, out_buf, sizeof (out_buf));
 256   saved->add_newlines = add_newlines;
 257   if (saved_input == NULL)
 258     saved->saved_input = NULL;
 259   else
 260     {
 261       saved->saved_input = (char *) xmalloc (saved_input_len);
 262       memcpy (saved->saved_input, saved_input, saved_input_len);
 263       saved->saved_input_len = saved_input_len;
 264     }
 265 #ifdef TC_M68K
 266   saved->scrub_m68k_mri = scrub_m68k_mri;
 267 #endif
 268   saved->mri_state = mri_state;
 269   saved->mri_last_ch = mri_last_ch;
 270 #if defined TC_ARM && defined OBJ_ELF
 271   saved->symver_state = symver_state;
 272 #endif
 273
 274   /* do_scrub_begin() is not useful, just wastes time.  */
 275
 276   state = 0;
 277   saved_input = NULL;
 278   add_newlines = 0;
 279
 280   return (char *) saved;
 281 }
 282
 283 void
 284 app_pop (char *arg)
 285 {
 286   register struct app_save *saved = (struct app_save *) arg;
 287
 288   /* There is no do_scrub_end ().  */
 289   state = saved->state;
 290   old_state = saved->old_state;
 291   out_string = saved->out_string;
 292   memcpy (out_buf, saved->out_buf, sizeof (out_buf));
 293   add_newlines = saved->add_newlines;
 294   if (saved->saved_input == NULL)
 295     saved_input = NULL;
 296   else
 297     {
 298       gas_assert (saved->saved_input_len <= sizeof (input_buffer));
 299       memcpy (input_buffer, saved->saved_input, saved->saved_input_len);
 300       saved_input = input_buffer;
 301       saved_input_len = saved->saved_input_len;
 302       free (saved->saved_input);
 303     }
 304 #ifdef TC_M68K
 305   scrub_m68k_mri = saved->scrub_m68k_mri;
 306 #endif
 307   mri_state = saved->mri_state;
 308   mri_last_ch = saved->mri_last_ch;
 309 #if defined TC_ARM && defined OBJ_ELF
 310   symver_state = saved->symver_state;
 311 #endif
 312
 313   free (arg);
 314 }
 315
 316 /* @@ This assumes that \n &c are the same on host and target.  This is not
 317    necessarily true.  */
 318
 319 static int
 320 process_escape (int ch)
 321 {
 322   switch (ch)
 323     {
 324     case 'b':
 325       return '\b';
 326     case 'f':
 327       return '\f';
 328     case 'n':
 329       return '\n';
 330     case 'r':
 331       return '\r';
 332     case 't':
 333       return '\t';
 334     case '\'':
 335       return '\'';
 336     case '"':
 337       return '\"';
 338     default:
 339       return ch;
 340     }
 341 }
 342
 343 /* This function is called to process input characters.  The GET
 344    parameter is used to retrieve more input characters.  GET should
 345    set its parameter to point to a buffer, and return the length of
 346    the buffer; it should return 0 at end of file.  The scrubbed output
 347    characters are put into the buffer starting at TOSTART; the TOSTART
 348    buffer is TOLEN bytes in length.  The function returns the number
 349    of scrubbed characters put into TOSTART.  This will be TOLEN unless
 350    end of file was seen.  This function is arranged as a state
 351    machine, and saves its state so that it may return at any point.
 352    This is the way the old code used to work.  */
 353
 354 size_t
 355 do_scrub_chars (size_t (*get) (char *, size_t), char *tostart, size_t tolen)
 356 {
 357   char *to = tostart;
 358   char *toend = tostart + tolen;
 359   char *from;
 360   char *fromend;
 361   size_t fromlen;
 362   register int ch, ch2 = 0;
 363   /* Character that started the string we're working on.  */
 364   static char quotechar;
 365
 366   /*State 0: beginning of normal line
 367           1: After first whitespace on line (flush more white)
 368           2: After first non-white (opcode) on line (keep 1white)
 369           3: after second white on line (into operands) (flush white)
 370           4: after putting out a .linefile, put out digits
 371           5: parsing a string, then go to old-state
 372           6: putting out \ escape in a "d string.
 373           7: no longer used
 374           8: no longer used
 375           9: After seeing symbol char in state 3 (keep 1white after symchar)
 376          10: After seeing whitespace in state 9 (keep white before symchar)
 377          11: After seeing a symbol character in state 0 (eg a label definition)
 378          -1: output string in out_string and go to the state in old_state
 379          -2: flush text until a '*' '/' is seen, then go to state old_state
 380 #ifdef TC_V850
 381          12: After seeing a dash, looking for a second dash as a start
 382              of comment.
 383 #endif
 384 #ifdef DOUBLEBAR_PARALLEL
 385          13: After seeing a vertical bar, looking for a second
 386              vertical bar as a parallel expression separator.
 387 #endif
 388 #ifdef TC_PREDICATE_START_CHAR
 389          14: After seeing a predicate start character at state 0, looking
 390              for a predicate end character as predicate.
 391          15: After seeing a predicate start character at state 1, looking
 392              for a predicate end character as predicate.
 393 #endif
 394 #ifdef TC_Z80
 395          16: After seeing an 'a' or an 'A' at the start of a symbol
 396          17: After seeing an 'f' or an 'F' in state 16
 397 #endif
 398           */
 399
 400   /* I added states 9 and 10 because the MIPS ECOFF assembler uses
 401      constructs like ``.loc 1 20''.  This was turning into ``.loc
 402      120''.  States 9 and 10 ensure that a space is never dropped in
 403      between characters which could appear in an identifier.  Ian
 404      Taylor, ian@cygnus.com.
 405
 406      I added state 11 so that something like "Lfoo add %r25,%r26,%r27" works
 407      correctly on the PA (and any other target where colons are optional).
 408      Jeff Law, law@cs.utah.edu.
 409
 410      I added state 13 so that something like "cmp r1, r2 || trap #1" does not
 411      get squashed into "cmp r1,r2||trap#1", with the all important space
 412      between the 'trap' and the '#1' being eliminated.  nickc@cygnus.com  */
 413
 414   /* This macro gets the next input character.  */
 415
 416 #define GET()                                                   \
 417   (from < fromend                                               \
 418    ? * (unsigned char *) (from++)                               \
 419    : (saved_input = NULL,                                       \
 420       fromlen = (*get) (input_buffer, sizeof input_buffer),     \
 421       from = input_buffer,                                      \
 422       fromend = from + fromlen,                                 \
 423       (fromlen == 0                                             \
 424        ? EOF                                                    \
 425        : * (unsigned char *) (from++))))
 426
 427   /* This macro pushes a character back on the input stream.  */
 428
 429 #define UNGET(uch) (*--from = (uch))
 430
 431   /* This macro puts a character into the output buffer.  If this
 432      character fills the output buffer, this macro jumps to the label
 433      TOFULL.  We use this rather ugly approach because we need to
 434      handle two different termination conditions: EOF on the input
 435      stream, and a full output buffer.  It would be simpler if we
 436      always read in the entire input stream before processing it, but
 437      I don't want to make such a significant change to the assembler's
 438      memory usage.  */
 439
 440 #define PUT(pch)                                \
 441   do                                            \
 442     {                                           \
 443       *to++ = (pch);                            \
 444       if (to >= toend)                          \
 445         goto tofull;                            \
 446     }                                           \
 447   while (0)
 448
 449   if (saved_input != NULL)
 450     {
 451       from = saved_input;
 452       fromend = from + saved_input_len;
 453     }
 454   else
 455     {
 456       fromlen = (*get) (input_buffer, sizeof input_buffer);
 457       if (fromlen == 0)
 458         return 0;
 459       from = input_buffer;
 460       fromend = from + fromlen;
 461     }
 462
 463   while (1)
 464     {
 465       /* The cases in this switch end with continue, in order to
 466          branch back to the top of this while loop and generate the
 467          next output character in the appropriate state.  */
 468       switch (state)
 469         {
 470         case -1:
 471           ch = *out_string++;
 472           if (*out_string == '\0')
 473             {
 474               state = old_state;
 475               old_state = 3;
 476             }
 477           PUT (ch);
 478           continue;
 479
 480         case -2:
 481           for (;;)
 482             {
 483               do
 484                 {
 485                   ch = GET ();
 486
 487                   if (ch == EOF)
 488                     {
 489                       as_warn (_("end of file in comment"));
 490                       goto fromeof;
 491                     }
 492
 493                   if (ch == '\n')
 494                     PUT ('\n');
 495                 }
 496               while (ch != '*');
 497
 498               while ((ch = GET ()) == '*')
 499                 ;
 500
 501               if (ch == EOF)
 502                 {
 503                   as_warn (_("end of file in comment"));
 504                   goto fromeof;
 505                 }
 506
 507               if (ch == '/')
 508                 break;
 509
 510               UNGET (ch);
 511             }
 512
 513           state = old_state;
 514           UNGET (' ');
 515           continue;
 516
 517         case 4:
 518           ch = GET ();
 519           if (ch == EOF)
 520             goto fromeof;
 521           else if (ch >= '0' && ch <= '9')
 522             PUT (ch);
 523           else
 524             {
 525               while (ch != EOF && IS_WHITESPACE (ch))
 526                 ch = GET ();
 527               if (ch == '"')
 528                 {
 529                   quotechar = ch;
 530                   state = 5;
 531                   old_state = 3;
 532                   PUT (ch);
 533                 }
 534               else
 535                 {
 536                   while (ch != EOF && ch != '\n')
 537                     ch = GET ();
 538                   state = 0;
 539                   PUT (ch);
 540                 }
 541             }
 542           continue;
 543
 544         case 5:
 545           /* We are going to copy everything up to a quote character,
 546              with special handling for a backslash.  We try to
 547              optimize the copying in the simple case without using the
 548              GET and PUT macros.  */
 549           {
 550             char *s;
 551             ptrdiff_t len;
 552
 553             for (s = from; s < fromend; s++)
 554               {
 555                 ch = *s;
 556                 if (ch == '\\'
 557                     || ch == quotechar
 558                     || ch == '\n')
 559                   break;
 560               }
 561             len = s - from;
 562             if (len > toend - to)
 563               len = toend - to;
 564             if (len > 0)
 565               {
 566                 memcpy (to, from, len);
 567                 to += len;
 568                 from += len;
 569                 if (to >= toend)
 570                   goto tofull;
 571               }
 572           }
 573
 574           ch = GET ();
 575           if (ch == EOF)
 576             {
 577               /* This buffer is here specifically so
 578                  that the UNGET below will work.  */
 579               static char one_char_buf[1];
 580
 581               as_warn (_("end of file in string; '%c' inserted"), quotechar);
 582               state = old_state;
 583               from = fromend = one_char_buf + 1;
 584               fromlen = 1;
 585               UNGET ('\n');
 586               PUT (quotechar);
 587             }
 588           else if (ch == quotechar)
 589             {
 590               state = old_state;
 591               PUT (ch);
 592             }
 593 #ifndef NO_STRING_ESCAPES
 594           else if (ch == '\\')
 595             {
 596               state = 6;
 597               PUT (ch);
 598             }
 599 #endif
 600           else if (scrub_m68k_mri && ch == '\n')
 601             {
 602               /* Just quietly terminate the string.  This permits lines like
 603                    bne  label   loop if we haven't reach end yet.  */
 604               state = old_state;
 605               UNGET (ch);
 606               PUT ('\'');
 607             }
 608           else
 609             {
 610               PUT (ch);
 611             }
 612           continue;
 613
 614         case 6:
 615           state = 5;
 616           ch = GET ();
 617           switch (ch)
 618             {
 619               /* Handle strings broken across lines, by turning '\n' into
 620                  '\\' and 'n'.  */
 621             case '\n':
 622               UNGET ('n');
 623               add_newlines++;
 624               PUT ('\\');
 625               continue;
 626
 627             case EOF:
 628               as_warn (_("end of file in string; '%c' inserted"), quotechar);
 629               PUT (quotechar);
 630               continue;
 631
 632             case '"':
 633             case '\\':
 634             case 'b':
 635             case 'f':
 636             case 'n':
 637             case 'r':
 638             case 't':
 639             case 'v':
 640             case 'x':
 641             case 'X':
 642             case '0':
 643             case '1':
 644             case '2':
 645             case '3':
 646             case '4':
 647             case '5':
 648             case '6':
 649             case '7':
 650               break;
 651
 652             default:
 653 #ifdef ONLY_STANDARD_ESCAPES
 654               as_warn (_("unknown escape '\\%c' in string; ignored"), ch);
 655 #endif
 656               break;
 657             }
 658           PUT (ch);
 659           continue;
 660
 661 #ifdef DOUBLEBAR_PARALLEL
 662         case 13:
 663           ch = GET ();
 664           if (ch != '|')
 665             abort ();
 666
 667           /* Reset back to state 1 and pretend that we are parsing a
 668              line from just after the first white space.  */
 669           state = 1;
 670           PUT ('|');
 671 #ifdef TC_TIC6X
 672           /* "||^" is used for SPMASKed instructions.  */
 673           ch = GET ();
 674           if (ch == EOF)
 675             goto fromeof;
 676           else if (ch == '^')
 677             PUT ('^');
 678           else
 679             UNGET (ch);
 680 #endif
 681           continue;
 682 #endif
 683 #ifdef TC_Z80
 684         case 16:
 685           /* We have seen an 'a' at the start of a symbol, look for an 'f'.  */
 686           ch = GET ();
 687           if (ch == 'f' || ch == 'F')
 688             {
 689               state = 17;
 690               PUT (ch);
 691             }
 692           else
 693             {
 694               state = 9;
 695               break;
 696             }
 697         case 17:
 698           /* We have seen "af" at the start of a symbol,
 699              a ' here is a part of that symbol.  */
 700           ch = GET ();
 701           state = 9;
 702           if (ch == '\'')
 703             /* Change to avoid warning about unclosed string.  */
 704             PUT ('`');
 705           else if (ch != EOF)
 706             UNGET (ch);
 707           break;
 708 #endif
 709         }
 710
 711       /* OK, we are somewhere in states 0 through 4 or 9 through 11.  */
 712
 713       /* flushchar: */
 714       ch = GET ();
 715
 716 #ifdef TC_PREDICATE_START_CHAR
 717       if (ch == TC_PREDICATE_START_CHAR && (state == 0 || state == 1))
 718         {
 719           state += 14;
 720           PUT (ch);
 721           continue;
 722         }
 723       else if (state == 14 || state == 15)
 724         {
 725           if (ch == TC_PREDICATE_END_CHAR)
 726             {
 727               state -= 14;
 728               PUT (ch);
 729               ch = GET ();
 730             }
 731           else
 732             {
 733               PUT (ch);
 734               continue;
 735             }
 736         }
 737 #endif
 738
 739     recycle:
 740
 741 #if defined TC_ARM && defined OBJ_ELF
 742       /* We need to watch out for .symver directives.  See the comment later
 743          in this function.  */
 744       if (symver_state == NULL)
 745         {
 746           if ((state == 0 || state == 1) && ch == symver_pseudo[0])
 747             symver_state = symver_pseudo + 1;
 748         }
 749       else
 750         {
 751           /* We advance to the next state if we find the right
 752              character.  */
 753           if (ch != '\0' && (*symver_state == ch))
 754             ++symver_state;
 755           else if (*symver_state != '\0')
 756             /* We did not get the expected character, or we didn't
 757                get a valid terminating character after seeing the
 758                entire pseudo-op, so we must go back to the beginning.  */
 759             symver_state = NULL;
 760           else
 761             {
 762               /* We've read the entire pseudo-op.  If this is the end
 763                  of the line, go back to the beginning.  */
 764               if (IS_NEWLINE (ch))
 765                 symver_state = NULL;
 766             }
 767         }
 768 #endif /* TC_ARM && OBJ_ELF */
 769
 770 #ifdef TC_M68K
 771       /* We want to have pseudo-ops which control whether we are in
 772          MRI mode or not.  Unfortunately, since m68k MRI mode affects
 773          the scrubber, that means that we need a special purpose
 774          recognizer here.  */
 775       if (mri_state == NULL)
 776         {
 777           if ((state == 0 || state == 1)
 778               && ch == mri_pseudo[0])
 779             mri_state = mri_pseudo + 1;
 780         }
 781       else
 782         {
 783           /* We advance to the next state if we find the right
 784              character, or if we need a space character and we get any
 785              whitespace character, or if we need a '0' and we get a
 786              '1' (this is so that we only need one state to handle
 787              ``.mri 0'' and ``.mri 1'').  */
 788           if (ch != '\0'
 789               && (*mri_state == ch
 790                   || (*mri_state == ' '
 791                       && lex[ch] == LEX_IS_WHITESPACE)
 792                   || (*mri_state == '0'
 793                       && ch == '1')))
 794             {
 795               mri_last_ch = ch;
 796               ++mri_state;
 797             }
 798           else if (*mri_state != '\0'
 799                    || (lex[ch] != LEX_IS_WHITESPACE
 800                        && lex[ch] != LEX_IS_NEWLINE))
 801             {
 802               /* We did not get the expected character, or we didn't
 803                  get a valid terminating character after seeing the
 804                  entire pseudo-op, so we must go back to the
 805                  beginning.  */
 806               mri_state = NULL;
 807             }
 808           else
 809             {
  810               /* We've read the entire pseudo-op.  mri_last_ch is
 811                  either '0' or '1' indicating whether to enter or
 812                  leave MRI mode.  */
 813               do_scrub_begin (mri_last_ch == '1');
 814               mri_state = NULL;
 815
 816               /* We continue handling the character as usual.  The
 817                  main gas reader must also handle the .mri pseudo-op
 818                  to control expression parsing and the like.  */
 819             }
 820         }
 821 #endif
 822
 823       if (ch == EOF)
 824         {
 825           if (state != 0)
 826             {
 827               as_warn (_("end of file not at end of a line; newline inserted"));
 828               state = 0;
 829               PUT ('\n');
 830             }
 831           goto fromeof;
 832         }
 833
 834       switch (lex[ch])
 835         {
 836         case LEX_IS_WHITESPACE:
 837           do
 838             {
 839               ch = GET ();
 840             }
 841           while (ch != EOF && IS_WHITESPACE (ch));
 842           if (ch == EOF)
 843             goto fromeof;
 844
 845           if (state == 0)
 846             {
 847               /* Preserve a single whitespace character at the
 848                  beginning of a line.  */
 849               state = 1;
 850               UNGET (ch);
 851               PUT (' ');
 852               break;
 853             }
 854
 855 #ifdef KEEP_WHITE_AROUND_COLON
 856           if (lex[ch] == LEX_IS_COLON)
 857             {
 858               /* Only keep this white if there's no white *after* the
 859                  colon.  */
 860               ch2 = GET ();
 861               if (ch2 != EOF)
 862                 UNGET (ch2);
 863               if (!IS_WHITESPACE (ch2))
 864                 {
 865                   state = 9;
 866                   UNGET (ch);
 867                   PUT (' ');
 868                   break;
 869                 }
 870             }
 871 #endif
 872           if (IS_COMMENT (ch)
 873               || ch == '/'
 874               || IS_LINE_SEPARATOR (ch)
 875               || IS_PARALLEL_SEPARATOR (ch))
 876             {
 877               if (scrub_m68k_mri)
 878                 {
 879                   /* In MRI mode, we keep these spaces.  */
 880                   UNGET (ch);
 881                   PUT (' ');
 882                   break;
 883                 }
 884               goto recycle;
 885             }
 886
 887           /* If we're in state 2 or 11, we've seen a non-white
 888              character followed by whitespace.  If the next character
 889              is ':', this is whitespace after a label name which we
 890              normally must ignore.  In MRI mode, though, spaces are
 891              not permitted between the label and the colon.  */
 892           if ((state == 2 || state == 11)
 893               && lex[ch] == LEX_IS_COLON
 894               && ! scrub_m68k_mri)
 895             {
 896               state = 1;
 897               PUT (ch);
 898               break;
 899             }
 900
 901           switch (state)
 902             {
 903             case 1:
 904               /* We can arrive here if we leave a leading whitespace
 905                  character at the beginning of a line.  */
 906               goto recycle;
 907             case 2:
 908               state = 3;
 909               if (to + 1 < toend)
 910                 {
 911                   /* Optimize common case by skipping UNGET/GET.  */
 912                   PUT (' ');    /* Sp after opco */
 913                   goto recycle;
 914                 }
 915               UNGET (ch);
 916               PUT (' ');
 917               break;
 918             case 3:
 919 #ifndef TC_KEEP_OPERAND_SPACES
 920               /* For TI C6X, we keep these spaces as they may separate
 921                  functional unit specifiers from operands.  */
 922               if (scrub_m68k_mri)
 923 #endif
 924                 {
 925                   /* In MRI mode, we keep these spaces.  */
 926                   UNGET (ch);
 927                   PUT (' ');
 928                   break;
 929                 }
 930               goto recycle;     /* Sp in operands */
 931             case 9:
 932             case 10:
 933 #ifndef TC_KEEP_OPERAND_SPACES
 934               if (scrub_m68k_mri)
 935 #endif
 936                 {
 937                   /* In MRI mode, we keep these spaces.  */
 938                   state = 3;
 939                   UNGET (ch);
 940                   PUT (' ');
 941                   break;
 942                 }
 943               state = 10;       /* Sp after symbol char */
 944               goto recycle;
 945             case 11:
 946               if (LABELS_WITHOUT_COLONS || flag_m68k_mri)
 947                 state = 1;
 948               else
 949                 {
 950                   /* We know that ch is not ':', since we tested that
 951                      case above.  Therefore this is not a label, so it
 952                      must be the opcode, and we've just seen the
 953                      whitespace after it.  */
 954                   state = 3;
 955                 }
 956               UNGET (ch);
 957               PUT (' ');        /* Sp after label definition.  */
 958               break;
 959             default:
 960               BAD_CASE (state);
 961             }
 962           break;
 963
 964         case LEX_IS_TWOCHAR_COMMENT_1ST:
 965           ch2 = GET ();
 966           if (ch2 == '*')
 967             {
 968               for (;;)
 969                 {
 970                   do
 971                     {
 972                       ch2 = GET ();
 973                       if (ch2 != EOF && IS_NEWLINE (ch2))
 974                         add_newlines++;
 975                     }
 976                   while (ch2 != EOF && ch2 != '*');
 977
 978                   while (ch2 == '*')
 979                     ch2 = GET ();
 980
 981                   if (ch2 == EOF || ch2 == '/')
 982                     break;
 983
 984                   /* This UNGET will ensure that we count newlines
 985                      correctly.  */
 986                   UNGET (ch2);
 987                 }
 988
 989               if (ch2 == EOF)
 990                 as_warn (_("end of file in multiline comment"));
 991
 992               ch = ' ';
 993               goto recycle;
 994             }
 995 #ifdef DOUBLESLASH_LINE_COMMENTS
 996           else if (ch2 == '/')
 997             {
 998               do
 999                 {
1000                   ch = GET ();
1001                 }
1002               while (ch != EOF && !IS_NEWLINE (ch));
1003               if (ch == EOF)
1004                 as_warn ("end of file in comment; newline inserted");
1005               state = 0;
1006               PUT ('\n');
1007               break;
1008             }
1009 #endif
1010           else
1011             {
1012               if (ch2 != EOF)
1013                 UNGET (ch2);
1014               if (state == 9 || state == 10)
1015                 state = 3;
1016               PUT (ch);
1017             }
1018           break;
1019
1020         case LEX_IS_STRINGQUOTE:
1021           quotechar = ch;
1022           if (state == 10)
1023             {
1024               /* Preserve the whitespace in foo "bar".  */
1025               UNGET (ch);
1026               state = 3;
1027               PUT (' ');
1028
1029               /* PUT didn't jump out.  We could just break, but we
1030                  know what will happen, so optimize a bit.  */
1031               ch = GET ();
1032               old_state = 3;
1033             }
1034           else if (state == 9)
1035             old_state = 3;
1036           else
1037             old_state = state;
1038           state = 5;
1039           PUT (ch);
1040           break;
1041
1042 #ifndef IEEE_STYLE
1043         case LEX_IS_ONECHAR_QUOTE:
1044 #ifdef H_TICK_HEX
1045           if (state == 9 && enable_h_tick_hex)
1046             {
1047               char c;
1048
1049               c = GET ();
1050               as_warn ("'%c found after symbol", c);
1051               UNGET (c);
1052             }
1053 #endif
1054           if (state == 10)
1055             {
1056               /* Preserve the whitespace in foo 'b'.  */
1057               UNGET (ch);
1058               state = 3;
1059               PUT (' ');
1060               break;
1061             }
1062           ch = GET ();
1063           if (ch == EOF)
1064             {
1065               as_warn (_("end of file after a one-character quote; \\0 inserted"));
1066               ch = 0;
1067             }
1068           if (ch == '\\')
1069             {
1070               ch = GET ();
1071               if (ch == EOF)
1072                 {
1073                   as_warn (_("end of file in escape character"));
1074                   ch = '\\';
1075                 }
1076               else
1077                 ch = process_escape (ch);
1078             }
1079           sprintf (out_buf, "%d", (int) (unsigned char) ch);
1080
1081           /* None of these 'x constants for us.  We want 'x'.  */
1082           if ((ch = GET ()) != '\'')
1083             {
1084 #ifdef REQUIRE_CHAR_CLOSE_QUOTE
1085               as_warn (_("missing close quote; (assumed)"));
1086 #else
1087               if (ch != EOF)
1088                 UNGET (ch);
1089 #endif
1090             }
1091           if (strlen (out_buf) == 1)
1092             {
1093               PUT (out_buf[0]);
1094               break;
1095             }
1096           if (state == 9)
1097             old_state = 3;
1098           else
1099             old_state = state;
1100           state = -1;
1101           out_string = out_buf;
1102           PUT (*out_string++);
1103           break;
1104 #endif
1105
1106         case LEX_IS_COLON:
1107 #ifdef KEEP_WHITE_AROUND_COLON
1108           state = 9;
1109 #else
1110           if (state == 9 || state == 10)
1111             state = 3;
1112           else if (state != 3)
1113             state = 1;
1114 #endif
1115           PUT (ch);
1116           break;
1117
1118         case LEX_IS_NEWLINE:
1119           /* Roll out a bunch of newlines from inside comments, etc.  */
1120           if (add_newlines)
1121             {
1122               --add_newlines;
1123               UNGET (ch);
1124             }
1125           /* Fall through.  */
1126
1127         case LEX_IS_LINE_SEPARATOR:
1128           state = 0;
1129           PUT (ch);
1130           break;
1131
1132         case LEX_IS_PARALLEL_SEPARATOR:
1133           state = 1;
1134           PUT (ch);
1135           break;
1136
1137 #ifdef TC_V850
1138         case LEX_IS_DOUBLEDASH_1ST:
1139           ch2 = GET ();
1140           if (ch2 != '-')
1141             {
1142               if (ch2 != EOF)
1143                 UNGET (ch2);
1144               goto de_fault;
1145             }
1146           /* Read and skip to end of line.  */
1147           do
1148             {
1149               ch = GET ();
1150             }
1151           while (ch != EOF && ch != '\n');
1152
1153           if (ch == EOF)
1154             as_warn (_("end of file in comment; newline inserted"));
1155
1156           state = 0;
1157           PUT ('\n');
1158           break;
1159 #endif
1160 #ifdef DOUBLEBAR_PARALLEL
1161         case LEX_IS_DOUBLEBAR_1ST:
1162           ch2 = GET ();
1163           if (ch2 != EOF)
1164             UNGET (ch2);
1165           if (ch2 != '|')
1166             goto de_fault;
1167
1168           /* Handle '||' in two states as invoking PUT twice might
1169              result in the first one jumping out of this loop.  We'd
1170              then lose track of the state and one '|' char.  */
1171           state = 13;
1172           PUT ('|');
1173           break;
1174 #endif
1175         case LEX_IS_LINE_COMMENT_START:
1176           /* FIXME-someday: The two character comment stuff was badly
1177              thought out.  On i386, we want '/' as line comment start
1178              AND we want C style comments.  hence this hack.  The
1179              whole lexical process should be reworked.  xoxorich.  */
1180           if (ch == '/')
1181             {
1182               ch2 = GET ();
1183               if (ch2 == '*')
1184                 {
1185                   old_state = 3;
1186                   state = -2;
1187                   break;
1188                 }
1189               else
1190                 {
1191                   UNGET (ch2);
1192                 }
1193             }
1194
1195           if (state == 0 || state == 1) /* Only comment at start of line.  */
1196             {
1197               int startch;
1198
1199               startch = ch;
1200
1201               do
1202                 {
1203                   ch = GET ();
1204                 }
1205               while (ch != EOF && IS_WHITESPACE (ch));
1206
1207               if (ch == EOF)
1208                 {
1209                   as_warn (_("end of file in comment; newline inserted"));
1210                   PUT ('\n');
1211                   break;
1212                 }
1213
1214               if (ch < '0' || ch > '9' || state != 0 || startch != '#')
1215                 {
1216                   /* Not a cpp line.  */
1217                   while (ch != EOF && !IS_NEWLINE (ch))
1218                     ch = GET ();
1219                   if (ch == EOF)
1220                     as_warn (_("end of file in comment; newline inserted"));
1221                   state = 0;
1222                   PUT ('\n');
1223                   break;
1224                 }
1225               /* Looks like `# 123 "filename"' from cpp.  */
1226               UNGET (ch);
1227               old_state = 4;
1228               state = -1;
1229               if (scrub_m68k_mri)
1230                 out_string = "\tlinefile ";
1231               else
1232                 out_string = "\t.linefile ";
1233               PUT (*out_string++);
1234               break;
1235             }
1236
1237 #ifdef TC_D10V
1238           /* All insns end in a char for which LEX_IS_SYMBOL_COMPONENT is true.
1239              Trap is the only short insn that has a first operand that is
1240              neither register nor label.
1241              We must prevent exef0f ||trap #1 from degenerating to exef0f ||trap#1.
1242              We can't make '#' LEX_IS_SYMBOL_COMPONENT because it is
1243              already LEX_IS_LINE_COMMENT_START.  However, it is the
1244              only character in line_comment_chars for d10v, hence we
1245              can recognize it as such.  */
1246           /* An alternative approach would be to reset the state to 1 when
1247              we see '||', '<-' or '->', but that seems to be overkill.  */
1248           if (state == 10)
1249             PUT (' ');
1250 #endif
1251           /* We have a line comment character which is not at the
1252              start of a line.  If this is also a normal comment
1253              character, fall through.  Otherwise treat it as a default
1254              character.  */
1255           if (strchr (tc_comment_chars, ch) == NULL
1256               && (! scrub_m68k_mri
1257                   || (ch != '!' && ch != '*')))
1258             goto de_fault;
1259           if (scrub_m68k_mri
1260               && (ch == '!' || ch == '*' || ch == '#')
1261               && state != 1
1262               && state != 10)
1263             goto de_fault;
1264           /* Fall through.  */
1265         case LEX_IS_COMMENT_START:
1266 #if defined TC_ARM && defined OBJ_ELF
1267           /* On the ARM, `@' is the comment character.
1268              Unfortunately this is also a special character in ELF .symver
1269              directives (and .type, though we deal with those another way).
1270              So we check if this line is such a directive, and treat
1271              the character as default if so.  This is a hack.  */
1272           if ((symver_state != NULL) && (*symver_state == 0))
1273             goto de_fault;
1274 #endif
1275
1276 #ifdef TC_ARM
1277           /* For the ARM, care is needed not to damage occurrences of \@
1278              by stripping the @ onwards.  Yuck.  */
1279           if (to > tostart && *(to - 1) == '\\')
1280             /* Do not treat the @ as a start-of-comment.  */
1281             goto de_fault;
1282 #endif
1283
1284 #ifdef WARN_COMMENTS
1285           if (!found_comment)
1286             as_where (&found_comment_file, &found_comment);
1287 #endif
1288           do
1289             {
1290               ch = GET ();
1291             }
1292           while (ch != EOF && !IS_NEWLINE (ch));
1293           if (ch == EOF)
1294             as_warn (_("end of file in comment; newline inserted"));
1295           state = 0;
1296           PUT ('\n');
1297           break;
1298
1299 #ifdef H_TICK_HEX
1300         case LEX_IS_H:
1301           /* Look for strings like H'[0-9A-Fa-f] and if found, replace
1302              the H' with 0x to make them gas-style hex characters.  */
1303           if (enable_h_tick_hex)
1304             {
1305               char quot;
1306
1307               quot = GET ();
1308               if (quot == '\'')
1309                 {
1310                   UNGET ('x');
1311                   ch = '0';
1312                 }
1313               else
1314                 UNGET (quot);
1315             }
1316           /* FALL THROUGH */
1317 #endif
1318
1319         case LEX_IS_SYMBOL_COMPONENT:
1320           if (state == 10)
1321             {
1322               /* This is a symbol character following another symbol
1323                  character, with whitespace in between.  We skipped
1324                  the whitespace earlier, so output it now.  */
1325               UNGET (ch);
1326               state = 3;
1327               PUT (' ');
1328               break;
1329             }
1330
1331 #ifdef TC_Z80
1332           /* "af'" is a symbol containing '\''.  */
1333           if (state == 3 && (ch == 'a' || ch == 'A'))
1334             {
1335               state = 16;
1336               PUT (ch);
1337               ch = GET ();
1338               if (ch == 'f' || ch == 'F')
1339                 {
1340                   state = 17;
1341                   PUT (ch);
1342                   break;
1343                 }
1344               else
1345                 {
1346                   state = 9;
1347                   if (ch == EOF || !IS_SYMBOL_COMPONENT (ch))
1348                     {
1349                       if (ch != EOF)
1350                         UNGET (ch);
1351                       break;
1352                     }
1353                 }
1354             }
1355 #endif
1356           if (state == 3)
1357             state = 9;
1358
1359           /* This is a common case.  Quickly copy CH and all the
1360              following symbol component or normal characters.  */
1361           if (to + 1 < toend
1362               && mri_state == NULL
1363 #if defined TC_ARM && defined OBJ_ELF
1364               && symver_state == NULL
1365 #endif
1366               )
1367             {
1368               char *s;
1369               ptrdiff_t len;
1370
1371               for (s = from; s < fromend; s++)
1372                 {
1373                   int type;
1374
1375                   ch2 = *(unsigned char *) s;
1376                   type = lex[ch2];
1377                   if (type != 0
1378                       && type != LEX_IS_SYMBOL_COMPONENT)
1379                     break;
1380                 }
1381
1382               if (s > from)
1383                 /* Handle the last character normally, for
1384                    simplicity.  */
1385                 --s;
1386
1387               len = s - from;
1388
1389               if (len > (toend - to) - 1)
1390                 len = (toend - to) - 1;
1391
1392               if (len > 0)
1393                 {
1394                   PUT (ch);
1395                   memcpy (to, from, len);
1396                   to += len;
1397                   from += len;
1398                   if (to >= toend)
1399                     goto tofull;
1400                   ch = GET ();
1401                 }
1402             }
1403
1404           /* Fall through.  */
1405         default:
1406         de_fault:
1407           /* Some relatively `normal' character.  */
1408           if (state == 0)
1409             {
1410               state = 11;       /* Now seeing label definition.  */
1411             }
1412           else if (state == 1)
1413             {
1414               state = 2;        /* Ditto.  */
1415             }
1416           else if (state == 9)
1417             {
1418               if (!IS_SYMBOL_COMPONENT (ch))
1419                 state = 3;
1420             }
1421           else if (state == 10)
1422             {
1423               if (ch == '\\')
1424                 {
1425                   /* Special handling for backslash: a backslash may
1426                      be the beginning of a formal parameter (of a
1427                      macro) following another symbol character, with
1428                      whitespace in between.  If that is the case, we
1429                      output a space before the parameter.  Strictly
1430                      speaking, correct handling depends upon what the
1431                      macro parameter expands into; if the parameter
1432                      expands into something which does not start with
1433                      an operand character, then we don't want to keep
1434                      the space.  We don't have enough information to
1435                      make the right choice, so here we are making the
1436                      choice which is more likely to be correct.  */
1437                   if (to + 1 >= toend)
1438                     {
1439                       /* If we're near the end of the buffer, save the
1440                          character for the next time round.  Otherwise
1441                          we'll lose our state.  */
1442                       UNGET (ch);
1443                       goto tofull;
1444                     }
1445                   *to++ = ' ';
1446                 }
1447
1448               state = 3;
1449             }
1450           PUT (ch);
1451           break;
1452         }
1453     }
1454
1455   /*NOTREACHED*/
1456
1457  fromeof:
1458   /* We have reached the end of the input.  */
1459   return to - tostart;
1460
1461  tofull:
1462   /* The output buffer is full.  Save any input we have not yet
1463      processed.  */
1464   if (fromend > from)
1465     {
1466       saved_input = from;
1467       saved_input_len = fromend - from;
1468     }
1469   else
1470     saved_input = NULL;
1471
1472   return to - tostart;
1473 }