Commit | Line | Data |
---|---|---|
be9485d5 SG |
1 | /* Prepare Tex index dribble output into an actual index. |
2 | Copyright (C) 1987 Free Software Foundation, Inc. | |
3 | ||
4 | This program is free software; you can redistribute it and/or modify | |
5 | it under the terms of the GNU General Public License as published by | |
6 | the Free Software Foundation; either version 1, or (at your option) | |
7 | any later version. | |
8 | ||
9 | This program is distributed in the hope that it will be useful, | |
10 | but WITHOUT ANY WARRANTY; without even the implied warranty of | |
11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
12 | GNU General Public License for more details. | |
13 | ||
14 | You should have received a copy of the GNU General Public License | |
15 | along with this program; if not, write to the Free Software | |
16 | Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ | |
17 | ||
18 | #include <stdio.h> | |
19 | #include <ctype.h> | |
20 | #include <errno.h> | |
21 | extern int errno; | |
22 | ||
23 | #ifdef VMS | |
24 | #ifndef VAX11C | |
25 | #define noshare | |
26 | #endif | |
27 | ||
28 | #include <perror.h> | |
29 | #include <file.h> | |
30 | ||
31 | #define EXIT_SUCCESS ((1 << 28) | 1) | |
32 | #define EXIT_FATAL ((1 << 28) | 4) | |
33 | #define unlink delete | |
34 | #define tell(fd) lseek(fd, 0L, 1) | |
35 | ||
36 | #else /* Not VMS */ | |
37 | ||
38 | #ifdef USG | |
39 | #include <sys/types.h> | |
40 | #include <sys/fcntl.h> | |
41 | #endif | |
42 | #include <sys/file.h> | |
43 | ||
44 | #define EXIT_SUCCESS 0 | |
45 | #define EXIT_FATAL 1 | |
46 | ||
47 | #endif /* Not VMS */ | |
48 | ||
49 | ||
50 | #ifndef L_XTND | |
51 | #define L_XTND 2 | |
52 | #endif | |
53 | ||
54 | #ifdef VMS | |
55 | extern noshare int sys_nerr; | |
56 | extern noshare char *sys_errlist[]; | |
57 | #else | |
58 | extern int sys_nerr; | |
59 | extern char *sys_errlist[]; | |
60 | #endif | |
61 | ||
62 | /* When sorting in core, this structure describes one line | |
63 | and the position and length of its first keyfield. */ | |
64 | ||
65 | struct lineinfo | |
66 | { | |
67 | char *text; /* The actual text of the line */ | |
68 | union | |
69 | { /* The start of the key (for textual comparison) */ | |
70 | char *text; | |
71 | long number; /* or the numeric value (for numeric comparison) */ | |
72 | } key; | |
73 | long keylen; /* Length of key field */ | |
74 | }; | |
75 | ||
76 | /* This structure describes a field to use as a sort key */ | |
77 | ||
78 | struct keyfield | |
79 | { | |
80 | int startwords; /* # words to skip */ | |
81 | int startchars; /* and # additional chars to skip, to start of field */ | |
82 | int endwords; /* similar, from beg (or end) of line, to find end of field */ | |
83 | int endchars; | |
84 | char ignore_blanks; /* Ignore spaces and tabs within the field */ | |
85 | char fold_case; /* Convert upper case to lower before comparing */ | |
86 | char reverse; /* Compare in reverse order */ | |
87 | char numeric; /* Parse text as an integer and compare the integers */ | |
88 | char positional; /* Sort according to position within the file */ | |
89 | char braced; /* Count balanced-braced groupings as fields */ | |
90 | }; | |
91 | ||
92 | /* Vector of keyfields to use */ | |
93 | ||
94 | struct keyfield keyfields[3]; | |
95 | ||
96 | /* Number of keyfields stored in that vector. */ | |
97 | ||
98 | int num_keyfields = 3; | |
99 | ||
100 | /* Vector of input file names, terminated with a zero (null pointer) */ | |
101 | ||
102 | char **infiles; | |
103 | ||
104 | /* Vector of corresponding output file names, or zero meaning default it */ | |
105 | ||
106 | char **outfiles; | |
107 | ||
108 | /* Length of `infiles' */ | |
109 | ||
110 | int num_infiles; | |
111 | ||
112 | /* Pointer to the array of pointers to lines being sorted */ | |
113 | ||
114 | char **linearray; | |
115 | ||
116 | /* The allocated length of `linearray'. */ | |
117 | ||
118 | long nlines; | |
119 | ||
120 | /* Directory to use for temporary files. On Unix, it ends with a slash. */ | |
121 | ||
122 | char *tempdir; | |
123 | ||
124 | /* Start of filename to use for temporary files. */ | |
125 | ||
126 | char *tempbase; | |
127 | ||
128 | /* Number of last temporary file. */ | |
129 | ||
130 | int tempcount; | |
131 | ||
132 | /* Number of last temporary file already deleted. | |
133 | Temporary files are deleted by `flush_tempfiles' in order of creation. */ | |
134 | ||
135 | int last_deleted_tempcount; | |
136 | ||
137 | /* During in-core sort, this points to the base of the data block | |
138 | which contains all the lines of data. */ | |
139 | ||
140 | char *text_base; | |
141 | ||
142 | /* Additional command switches */ | |
143 | ||
144 | int keep_tempfiles; /* Nonzero means do not delete tempfiles -- for debugging */ | |
145 | ||
146 | /* Forward declarations of functions in this file */ | |
147 | ||
148 | void decode_command (); | |
149 | void sort_in_core (); | |
150 | void sort_offline (); | |
151 | char **parsefile (); | |
152 | char *find_field (); | |
153 | char *find_pos (); | |
154 | long find_value (); | |
155 | char *find_braced_pos (); | |
156 | char *find_braced_end (); | |
157 | void writelines (); | |
158 | int compare_full (); | |
159 | long readline (); | |
160 | int merge_files (); | |
161 | int merge_direct (); | |
162 | char *concat (); | |
163 | char *maketempname (); | |
164 | void flush_tempfiles (); | |
165 | char *tempcopy (); | |
166 | ||
167 | extern char *mktemp (); | |
168 | \f | |
169 | #define MAX_IN_CORE_SORT 500000 | |
170 | ||
171 | int | |
172 | main (argc, argv) | |
173 | int argc; | |
174 | char **argv; | |
175 | { | |
176 | int i; | |
177 | ||
178 | tempcount = 0; | |
179 | last_deleted_tempcount = 0; | |
180 | ||
181 | /* Describe the kind of sorting to do. */ | |
182 | /* The first keyfield uses the first braced field and folds case */ | |
183 | keyfields[0].braced = 1; | |
184 | keyfields[0].fold_case = 1; | |
185 | keyfields[0].endwords = -1; | |
186 | keyfields[0].endchars = -1; | |
187 | /* The second keyfield uses the second braced field, numerically */ | |
188 | keyfields[1].braced = 1; | |
189 | keyfields[1].numeric = 1; | |
190 | keyfields[1].startwords = 1; | |
191 | keyfields[1].endwords = -1; | |
192 | keyfields[1].endchars = -1; | |
193 | /* The third keyfield (which is ignored while discarding duplicates) | |
194 | compares the whole line */ | |
195 | keyfields[2].endwords = -1; | |
196 | keyfields[2].endchars = -1; | |
197 | ||
198 | decode_command (argc, argv); | |
199 | ||
200 | tempbase = mktemp (concat ("txiXXXXXX", "", "")); | |
201 | ||
202 | /* Process input files completely, one by one. */ | |
203 | ||
204 | for (i = 0; i < num_infiles; i++) | |
205 | { | |
206 | int desc; | |
207 | long ptr; | |
208 | char *outfile; | |
209 | char *p; | |
210 | ||
211 | desc = open (infiles[i], 0, 0); | |
212 | if (desc < 0) pfatal_with_name (infiles[i]); | |
213 | lseek (desc, 0, L_XTND); | |
214 | ptr = tell (desc); | |
215 | close (desc); | |
216 | ||
217 | outfile = outfiles[i]; | |
218 | if (!outfile) | |
219 | { | |
220 | outfile = concat (infiles[i], "s", ""); | |
221 | } | |
222 | ||
223 | if (ptr < MAX_IN_CORE_SORT) | |
224 | /* Sort a small amount of data */ | |
225 | sort_in_core (infiles[i], ptr, outfile); | |
226 | else | |
227 | sort_offline (infiles[i], ptr, outfile); | |
228 | } | |
229 | ||
230 | flush_tempfiles (tempcount); | |
231 | exit (EXIT_SUCCESS); | |
232 | } | |
233 | \f | |
234 | /* This page decodes the command line arguments to set the parameter variables | |
235 | and set up the vector of keyfields and the vector of input files */ | |
236 | ||
237 | void | |
238 | decode_command (argc, argv) | |
239 | int argc; | |
240 | char **argv; | |
241 | { | |
242 | int i; | |
243 | char **ip; | |
244 | char **op; | |
245 | ||
246 | /* Store default values into parameter variables */ | |
247 | ||
248 | #ifdef VMS | |
249 | tempdir = "sys$scratch:"; | |
250 | #else | |
251 | tempdir = "/tmp/"; | |
252 | #endif | |
253 | ||
254 | keep_tempfiles = 0; | |
255 | ||
256 | /* Allocate argc input files, which must be enough. */ | |
257 | ||
258 | infiles = (char **) xmalloc (argc * sizeof (char *)); | |
259 | outfiles = (char **) xmalloc (argc * sizeof (char *)); | |
260 | ip = infiles; | |
261 | op = outfiles; | |
262 | ||
263 | /* First find all switches that control the default kind-of-sort */ | |
264 | ||
265 | for (i = 1; i < argc; i++) | |
266 | { | |
267 | int tem = classify_arg (argv[i]); | |
268 | char c; | |
269 | char *p; | |
270 | ||
271 | if (tem <= 0) | |
272 | { | |
273 | *ip++ = argv[i]; | |
274 | *op++ = 0; | |
275 | continue; | |
276 | } | |
277 | if (tem > 1) | |
278 | { | |
279 | if (i + 1 == argc) | |
280 | fatal ("switch %s given with no argument following it", argv[i]); | |
281 | else if (!strcmp (argv[i], "-T")) | |
282 | tempdir = argv[i + 1]; | |
283 | else if (!strcmp (argv[i], "-o")) | |
284 | *(op - 1) = argv[i + 1]; | |
285 | i += tem - 1; | |
286 | continue; | |
287 | } | |
288 | ||
289 | p = &argv[i][1]; | |
290 | while (c = *p++) | |
291 | switch (c) | |
292 | { | |
293 | case 'k': | |
294 | keep_tempfiles = 1; | |
295 | break; | |
296 | ||
297 | default: | |
298 | fatal ("invalid command switch %c", c); | |
299 | } | |
300 | switchdone: ; | |
301 | } | |
302 | ||
303 | /* Record number of keyfields, terminate list of filenames */ | |
304 | ||
305 | num_infiles = ip - infiles; | |
306 | *ip = 0; | |
307 | } | |
308 | ||
309 | /* Return 0 for an argument that is not a switch; | |
310 | for a switch, return 1 plus the number of following arguments that the switch swallows. | |
311 | */ | |
312 | ||
313 | int | |
314 | classify_arg (arg) | |
315 | char *arg; | |
316 | { | |
317 | if (!strcmp (arg, "-T") || !strcmp (arg, "-o")) | |
318 | return 2; | |
319 | if (arg[0] == '-') | |
320 | return 1; | |
321 | return 0; | |
322 | } | |
323 | \f | |
324 | /* Create a name for a temporary file */ | |
325 | ||
326 | char * | |
327 | maketempname (count) | |
328 | int count; | |
329 | { | |
330 | char tempsuffix[10]; | |
331 | sprintf (tempsuffix, "%d", count); | |
332 | return concat (tempdir, tempbase, tempsuffix); | |
333 | } | |
334 | ||
335 | /* Delete all temporary files up to the specified count */ | |
336 | ||
337 | void | |
338 | flush_tempfiles (to_count) | |
339 | int to_count; | |
340 | { | |
341 | if (keep_tempfiles) return; | |
342 | while (last_deleted_tempcount < to_count) | |
343 | unlink (maketempname (++last_deleted_tempcount)); | |
344 | } | |
345 | ||
346 | /* Copy an input file into a temporary file, and return the temporary file name */ | |
347 | ||
348 | #define BUFSIZE 1024 | |
349 | ||
350 | char * | |
351 | tempcopy (idesc) | |
352 | int idesc; | |
353 | { | |
354 | char *outfile = maketempname (++tempcount); | |
355 | int odesc; | |
356 | char buffer[BUFSIZE]; | |
357 | ||
358 | odesc = open (outfile, O_WRONLY | O_CREAT, 0666); | |
359 | ||
360 | if (odesc < 0) pfatal_with_name (outfile); | |
361 | ||
362 | while (1) | |
363 | { | |
364 | int nread = read (idesc, buffer, BUFSIZE); | |
365 | write (odesc, buffer, nread); | |
366 | if (!nread) break; | |
367 | } | |
368 | ||
369 | close (odesc); | |
370 | ||
371 | return outfile; | |
372 | } | |
373 | \f | |
374 | /* Compare two lines, provided as pointers to pointers to text, | |
375 | according to the specified set of keyfields */ | |
376 | ||
377 | int | |
378 | compare_full (line1, line2) | |
379 | char **line1, **line2; | |
380 | { | |
381 | int i; | |
382 | ||
383 | /* Compare using the first keyfield; | |
384 | if that does not distinguish the lines, try the second keyfield; and so on. */ | |
385 | ||
386 | for (i = 0; i < num_keyfields; i++) | |
387 | { | |
388 | long length1, length2; | |
389 | char *start1 = find_field (&keyfields[i], *line1, &length1); | |
390 | char *start2 = find_field (&keyfields[i], *line2, &length2); | |
391 | int tem = compare_field (&keyfields[i], start1, length1, *line1 - text_base, | |
392 | start2, length2, *line2 - text_base); | |
393 | if (tem) | |
394 | { | |
395 | if (keyfields[i].reverse) | |
396 | return - tem; | |
397 | return tem; | |
398 | } | |
399 | } | |
400 | ||
401 | return 0; /* Lines match exactly */ | |
402 | } | |
403 | ||
404 | /* Compare two lines described by structures | |
405 | in which the first keyfield is identified in advance. | |
406 | For positional sorting, assumes that the order of the lines in core | |
407 | reflects their nominal order. */ | |
408 | ||
409 | int | |
410 | compare_prepared (line1, line2) | |
411 | struct lineinfo *line1, *line2; | |
412 | { | |
413 | int i; | |
414 | int tem; | |
415 | char *text1, *text2; | |
416 | ||
417 | /* Compare using the first keyfield, which has been found for us already */ | |
418 | if (keyfields->positional) | |
419 | { | |
420 | if (line1->text - text_base > line2->text - text_base) | |
421 | tem = 1; | |
422 | else | |
423 | tem = -1; | |
424 | } | |
425 | else if (keyfields->numeric) | |
426 | tem = line1->key.number - line2->key.number; | |
427 | else | |
428 | tem = compare_field (keyfields, line1->key.text, line1->keylen, 0, line2->key.text, line2->keylen, 0); | |
429 | if (tem) | |
430 | { | |
431 | if (keyfields->reverse) | |
432 | return - tem; | |
433 | return tem; | |
434 | } | |
435 | ||
436 | text1 = line1->text; | |
437 | text2 = line2->text; | |
438 | ||
439 | /* Compare using the second keyfield; | |
440 | if that does not distinguish the lines, try the third keyfield; and so on. */ | |
441 | ||
442 | for (i = 1; i < num_keyfields; i++) | |
443 | { | |
444 | long length1, length2; | |
445 | char *start1 = find_field (&keyfields[i], text1, &length1); | |
446 | char *start2 = find_field (&keyfields[i], text2, &length2); | |
447 | int tem = compare_field (&keyfields[i], start1, length1, text1 - text_base, | |
448 | start2, length2, text2 - text_base); | |
449 | if (tem) | |
450 | { | |
451 | if (keyfields[i].reverse) | |
452 | return - tem; | |
453 | return tem; | |
454 | } | |
455 | } | |
456 | ||
457 | return 0; /* Lines match exactly */ | |
458 | } | |
459 | ||
460 | /* Like compare_full but more general. | |
461 | You can pass any strings, and you can say how many keyfields to use. | |
462 | `pos1' and `pos2' should indicate the nominal positional ordering of | |
463 | the two lines in the input. */ | |
464 | ||
465 | int | |
466 | compare_general (str1, str2, pos1, pos2, use_keyfields) | |
467 | char *str1, *str2; | |
468 | long pos1, pos2; | |
469 | int use_keyfields; | |
470 | { | |
471 | int i; | |
472 | ||
473 | /* Compare using the first keyfield; | |
474 | if that does not distinguish the lines, try the second keyfield; and so on. */ | |
475 | ||
476 | for (i = 0; i < use_keyfields; i++) | |
477 | { | |
478 | long length1, length2; | |
479 | char *start1 = find_field (&keyfields[i], str1, &length1); | |
480 | char *start2 = find_field (&keyfields[i], str2, &length2); | |
481 | int tem = compare_field (&keyfields[i], start1, length1, pos1, start2, length2, pos2); | |
482 | if (tem) | |
483 | { | |
484 | if (keyfields[i].reverse) | |
485 | return - tem; | |
486 | return tem; | |
487 | } | |
488 | } | |
489 | ||
490 | return 0; /* Lines match exactly */ | |
491 | } | |
492 | ||
493 | /* Find the start and length of a field in `str' according to `keyfield'. | |
494 | A pointer to the starting character is returned, and the length | |
495 | is stored into the int that `lengthptr' points to. */ | |
496 | ||
497 | char * | |
498 | find_field (keyfield, str, lengthptr) | |
499 | struct keyfield *keyfield; | |
500 | char *str; | |
501 | long *lengthptr; | |
502 | { | |
503 | char *start; | |
504 | char *end; | |
505 | char *(*fun) (); | |
506 | ||
507 | if (keyfield->braced) fun = find_braced_pos; | |
508 | else fun = find_pos; | |
509 | ||
510 | start = ( *fun )(str, keyfield->startwords, keyfield->startchars, | |
511 | keyfield->ignore_blanks); | |
512 | if (keyfield->endwords < 0) | |
513 | { | |
514 | if (keyfield->braced) | |
515 | end = find_braced_end (start); | |
516 | else | |
517 | { | |
518 | end = start; | |
519 | while (*end && *end != '\n') end++; | |
520 | } | |
521 | } | |
522 | else | |
523 | { | |
524 | end = ( *fun )(str, keyfield->endwords, keyfield->endchars, 0); | |
525 | if (end - str < start - str) end = start; | |
526 | } | |
527 | *lengthptr = end - start; | |
528 | return start; | |
529 | } | |
530 | ||
531 | /* Find a pointer to a specified place within `str', | |
532 | skipping (from the beginning) `words' words and then `chars' chars. | |
533 | If `ignore_blanks' is nonzero, we skip all blanks | |
534 | after finding the specified word. */ | |
535 | ||
536 | char * | |
537 | find_pos (str, words, chars, ignore_blanks) | |
538 | char *str; | |
539 | int words, chars; | |
540 | int ignore_blanks; | |
541 | { | |
542 | int i; | |
543 | char *p = str; | |
544 | ||
545 | for (i = 0; i < words; i++) | |
546 | { | |
547 | char c; | |
548 | /* Find next bunch of nonblanks and skip them. */ | |
549 | while ((c = *p) == ' ' || c == '\t') p++; | |
550 | while ((c = *p) && c != '\n' && !(c == ' ' || c == '\t')) p++; | |
551 | if (!*p || *p == '\n') return p; | |
552 | } | |
553 | ||
554 | while (*p == ' ' || *p == '\t') p++; | |
555 | ||
556 | for (i = 0; i < chars; i++) | |
557 | { | |
558 | if (!*p || *p == '\n') break; | |
559 | p++; | |
560 | } | |
561 | return p; | |
562 | } | |
563 | ||
564 | /* Like find_pos but assumes that each field is surrounded by braces | |
565 | and that braces within fields are balanced. */ | |
566 | ||
567 | char * | |
568 | find_braced_pos (str, words, chars, ignore_blanks) | |
569 | char *str; | |
570 | int words, chars; | |
571 | int ignore_blanks; | |
572 | { | |
573 | int i; | |
574 | int bracelevel; | |
575 | char *p = str; | |
576 | char c; | |
577 | ||
578 | for (i = 0; i < words; i++) | |
579 | { | |
580 | bracelevel = 1; | |
581 | while ((c = *p++) != '{' && c != '\n' && c); | |
582 | if (c != '{') | |
583 | return p - 1; | |
584 | while (bracelevel) | |
585 | { | |
586 | c = *p++; | |
587 | if (c == '{') bracelevel++; | |
588 | if (c == '}') bracelevel--; | |
589 | #if 0 | |
590 | if (c == '\\' || c == '@') c = *p++; /* \ quotes braces and \ */ | |
591 | #endif | |
592 | if (c == 0 || c == '\n') return p-1; | |
593 | } | |
594 | } | |
595 | ||
596 | while ((c = *p++) != '{' && c != '\n' && c); | |
597 | ||
598 | if (c != '{') | |
599 | return p-1; | |
600 | ||
601 | if (ignore_blanks) | |
602 | while ((c = *p) == ' ' || c == '\t') p++; | |
603 | ||
604 | for (i = 0; i < chars; i++) | |
605 | { | |
606 | if (!*p || *p == '\n') break; | |
607 | p++; | |
608 | } | |
609 | return p; | |
610 | } | |
611 | ||
612 | /* Find the end of the balanced-brace field which starts at `str'. | |
613 | The position returned is just before the closing brace. */ | |
614 | ||
615 | char * | |
616 | find_braced_end (str) | |
617 | char *str; | |
618 | { | |
619 | int bracelevel; | |
620 | char *p = str; | |
621 | char c; | |
622 | ||
623 | bracelevel = 1; | |
624 | while (bracelevel) | |
625 | { | |
626 | c = *p++; | |
627 | if (c == '{') bracelevel++; | |
628 | if (c == '}') bracelevel--; | |
629 | #if 0 | |
630 | if (c == '\\' || c == '@') c = *p++; | |
631 | #endif | |
632 | if (c == 0 || c == '\n') return p-1; | |
633 | } | |
634 | return p - 1; | |
635 | } | |
636 | ||
637 | long | |
638 | find_value (start, length) | |
639 | char *start; | |
640 | long length; | |
641 | { | |
642 | while (length != 0L) { | |
643 | if (isdigit(*start)) | |
644 | return atol(start); | |
645 | length--; | |
646 | start++; | |
647 | } | |
648 | return 0l; | |
649 | } | |
650 | ||
651 | /* Vector used to translate characters for comparison. | |
652 | This is how we make all alphanumerics follow all else, | |
653 | and ignore case in the first sorting. */ | |
654 | int char_order[256]; | |
655 | ||
656 | init_char_order () | |
657 | { | |
658 | int i; | |
659 | for (i = 1; i < 256; i++) | |
660 | char_order[i] = i; | |
661 | ||
662 | for (i = '0'; i <= '9'; i++) | |
663 | char_order[i] += 512; | |
664 | ||
665 | for (i = 'a'; i <= 'z'; i++) { | |
666 | char_order[i] = 512 + i; | |
667 | char_order[i + 'A' - 'a'] = 512 + i; | |
668 | } | |
669 | } | |
670 | ||
671 | /* Compare two fields (each specified as a start pointer and a character count) | |
672 | according to `keyfield'. The sign of the value reports the relation between the fields */ | |
673 | ||
674 | int | |
675 | compare_field (keyfield, start1, length1, pos1, start2, length2, pos2) | |
676 | struct keyfield *keyfield; | |
677 | char *start1; | |
678 | long length1; | |
679 | long pos1; | |
680 | char *start2; | |
681 | long length2; | |
682 | long pos2; | |
683 | { | |
684 | if (keyfields->positional) | |
685 | { | |
686 | if (pos1 > pos2) | |
687 | return 1; | |
688 | else | |
689 | return -1; | |
690 | } | |
691 | if (keyfield->numeric) | |
692 | { | |
693 | long value = find_value (start1, length1) - find_value (start2, length2); | |
694 | if (value > 0) return 1; | |
695 | if (value < 0) return -1; | |
696 | return 0; | |
697 | } | |
698 | else | |
699 | { | |
700 | char *p1 = start1; | |
701 | char *p2 = start2; | |
702 | char *e1 = start1 + length1; | |
703 | char *e2 = start2 + length2; | |
704 | ||
705 | int fold_case = keyfield->fold_case; | |
706 | ||
707 | while (1) | |
708 | { | |
709 | int c1, c2; | |
710 | ||
711 | if (p1 == e1) c1 = 0; | |
712 | else c1 = *p1++; | |
713 | if (p2 == e2) c2 = 0; | |
714 | else c2 = *p2++; | |
715 | ||
716 | if (char_order[c1] != char_order[c2]) | |
717 | return char_order[c1] - char_order[c2]; | |
718 | if (!c1) break; | |
719 | } | |
720 | ||
721 | /* Strings are equal except possibly for case. */ | |
722 | p1 = start1; | |
723 | p2 = start2; | |
724 | while (1) | |
725 | { | |
726 | int c1, c2; | |
727 | ||
728 | if (p1 == e1) c1 = 0; | |
729 | else c1 = *p1++; | |
730 | if (p2 == e2) c2 = 0; | |
731 | else c2 = *p2++; | |
732 | ||
733 | if (c1 != c2) | |
734 | /* Reverse sign here so upper case comes out last. */ | |
735 | return c2 - c1; | |
736 | if (!c1) break; | |
737 | } | |
738 | ||
739 | return 0; | |
740 | } | |
741 | } | |
742 | \f | |
743 | /* A `struct linebuffer' is a structure which holds a line of text. | |
744 | `readline' reads a line from a stream into a linebuffer | |
745 | and works regardless of the length of the line. */ | |
746 | ||
747 | struct linebuffer | |
748 | { | |
749 | long size; | |
750 | char *buffer; | |
751 | }; | |
752 | ||
753 | /* Initialize a linebuffer for use */ | |
754 | ||
755 | void | |
756 | initbuffer (linebuffer) | |
757 | struct linebuffer *linebuffer; | |
758 | { | |
759 | linebuffer->size = 200; | |
760 | linebuffer->buffer = (char *) xmalloc (200); | |
761 | } | |
762 | ||
763 | /* Read a line of text from `stream' into `linebuffer'. | |
764 | Return the length of the line. */ | |
765 | ||
766 | long | |
767 | readline (linebuffer, stream) | |
768 | struct linebuffer *linebuffer; | |
769 | FILE *stream; | |
770 | { | |
771 | char *buffer = linebuffer->buffer; | |
772 | char *p = linebuffer->buffer; | |
773 | char *end = p + linebuffer->size; | |
774 | ||
775 | while (1) | |
776 | { | |
777 | int c = getc (stream); | |
778 | if (p == end) | |
779 | { | |
780 | buffer = (char *) xrealloc (buffer, linebuffer->size *= 2); | |
781 | p += buffer - linebuffer->buffer; | |
782 | end += buffer - linebuffer->buffer; | |
783 | linebuffer->buffer = buffer; | |
784 | } | |
785 | if (c < 0 || c == '\n') | |
786 | { | |
787 | *p = 0; | |
788 | break; | |
789 | } | |
790 | *p++ = c; | |
791 | } | |
792 | ||
793 | return p - buffer; | |
794 | } | |
795 | \f | |
796 | /* Sort an input file too big to sort in core. */ | |
797 | ||
798 | void | |
799 | sort_offline (infile, nfiles, total, outfile) | |
800 | char *infile; | |
801 | long total; | |
802 | char *outfile; | |
803 | { | |
804 | int ntemps = 2 * (total + MAX_IN_CORE_SORT - 1) / MAX_IN_CORE_SORT; /* More than enough */ | |
805 | char **tempfiles = (char **) xmalloc (ntemps * sizeof (char *)); | |
806 | FILE *istream = fopen (infile, "r"); | |
807 | int i; | |
808 | struct linebuffer lb; | |
809 | long linelength; | |
810 | int failure = 0; | |
811 | ||
812 | initbuffer (&lb); | |
813 | ||
814 | /* Read in one line of input data. */ | |
815 | ||
816 | linelength = readline (&lb, istream); | |
817 | ||
818 | if (lb.buffer[0] != '\\' && lb.buffer[0] != '@') | |
819 | { | |
820 | error ("%s: not a texinfo index file", infile); | |
821 | return; | |
822 | } | |
823 | ||
824 | /* Split up the input into `ntemps' temporary files, or maybe fewer, | |
825 | and put the new files' names into `tempfiles' */ | |
826 | ||
827 | for (i = 0; i < ntemps; i++) | |
828 | { | |
829 | char *outname = maketempname (++tempcount); | |
830 | FILE *ostream = fopen (outname, "w"); | |
831 | long tempsize = 0; | |
832 | ||
833 | if (!ostream) pfatal_with_name (outname); | |
834 | tempfiles[i] = outname; | |
835 | ||
836 | /* Copy lines into this temp file as long as it does not make file "too big" | |
837 | or until there are no more lines. */ | |
838 | ||
839 | while (tempsize + linelength + 1 <= MAX_IN_CORE_SORT) | |
840 | { | |
841 | tempsize += linelength + 1; | |
842 | fputs (lb.buffer, ostream); | |
843 | putc ('\n', ostream); | |
844 | ||
845 | /* Read another line of input data. */ | |
846 | ||
847 | linelength = readline (&lb, istream); | |
848 | if (!linelength && feof (istream)) break; | |
849 | ||
850 | if (lb.buffer[0] != '\\' && lb.buffer[0] != '@') | |
851 | { | |
852 | error ("%s: not a texinfo index file", infile); | |
853 | failure = 1; | |
854 | goto fail; | |
855 | } | |
856 | } | |
857 | fclose (ostream); | |
858 | if (feof (istream)) break; | |
859 | } | |
860 | ||
861 | free (lb.buffer); | |
862 | ||
863 | fail: | |
864 | /* Record number of temp files we actually needed. */ | |
865 | ||
866 | ntemps = i; | |
867 | ||
868 | /* Sort each tempfile into another tempfile. | |
869 | Delete the first set of tempfiles and put the names of the second into `tempfiles' */ | |
870 | ||
871 | for (i = 0; i < ntemps; i++) | |
872 | { | |
873 | char *newtemp = maketempname (++tempcount); | |
874 | sort_in_core (&tempfiles[i], MAX_IN_CORE_SORT, newtemp); | |
875 | if (!keep_tempfiles) | |
876 | unlink (tempfiles[i]); | |
877 | tempfiles[i] = newtemp; | |
878 | } | |
879 | ||
880 | if (failure) | |
881 | return; | |
882 | ||
883 | /* Merge the tempfiles together and indexify */ | |
884 | ||
885 | merge_files (tempfiles, ntemps, outfile); | |
886 | } | |
887 | \f | |
888 | /* Sort `infile', whose size is `total', | |
889 | assuming that is small enough to be done in-core, | |
890 | then indexify it and send the output to `outfile' (or to stdout). */ | |
891 | ||
892 | void | |
893 | sort_in_core (infile, total, outfile) | |
894 | char *infile; | |
895 | long total; | |
896 | char *outfile; | |
897 | { | |
898 | char **nextline; | |
899 | char *data = (char *) xmalloc (total + 1); | |
900 | char *file_data; | |
901 | long file_size; | |
902 | int i; | |
903 | FILE *ostream = stdout; | |
904 | struct lineinfo *lineinfo; | |
905 | ||
906 | /* Read the contents of the file into the moby array `data' */ | |
907 | ||
908 | int desc = open (infile, 0, 0); | |
909 | ||
910 | if (desc < 0) | |
911 | fatal ("failure reopening %s", infile); | |
912 | for (file_size = 0; ; ) | |
913 | { | |
914 | if ((i = read (desc, data + file_size, total - file_size)) <= 0) | |
915 | break; | |
916 | file_size += i; | |
917 | } | |
918 | file_data = data; | |
919 | data[file_size] = 0; | |
920 | ||
921 | close (desc); | |
922 | ||
923 | if (file_size > 0 && data[0] != '\\' && data[0] != '@') | |
924 | { | |
925 | error ("%s: not a texinfo index file", infile); | |
926 | return; | |
927 | } | |
928 | ||
929 | init_char_order (); | |
930 | ||
931 | /* Sort routines want to know this address */ | |
932 | ||
933 | text_base = data; | |
934 | ||
935 | /* Create the array of pointers to lines, with a default size frequently enough. */ | |
936 | ||
937 | nlines = total / 50; | |
938 | if (!nlines) nlines = 2; | |
939 | linearray = (char **) xmalloc (nlines * sizeof (char *)); | |
940 | ||
941 | /* `nextline' points to the next free slot in this array. | |
942 | `nlines' is the allocated size. */ | |
943 | ||
944 | nextline = linearray; | |
945 | ||
946 | /* Parse the input file's data, and make entries for the lines. */ | |
947 | ||
948 | nextline = parsefile (infile, nextline, file_data, file_size); | |
949 | if (nextline == 0) | |
950 | { | |
951 | error ("%s: not a texinfo index file", infile); | |
952 | return; | |
953 | } | |
954 | ||
955 | /* Sort the lines */ | |
956 | ||
957 | /* If we have enough space, find the first keyfield of each line in advance. | |
958 | Make a `struct lineinfo' for each line, which records the keyfield | |
959 | as well as the line, and sort them. */ | |
960 | ||
961 | lineinfo = (struct lineinfo *) malloc ((nextline - linearray) * sizeof (struct lineinfo)); | |
962 | ||
963 | if (lineinfo) | |
964 | { | |
965 | struct lineinfo *lp; | |
966 | char **p; | |
967 | ||
968 | for (lp = lineinfo, p = linearray; p != nextline; lp++, p++) | |
969 | { | |
970 | lp->text = *p; | |
971 | lp->key.text = find_field (keyfields, *p, &lp->keylen); | |
972 | if (keyfields->numeric) | |
973 | lp->key.number = find_value (lp->key.text, lp->keylen); | |
974 | } | |
975 | ||
976 | qsort (lineinfo, nextline - linearray, sizeof (struct lineinfo), compare_prepared); | |
977 | ||
978 | for (lp = lineinfo, p = linearray; p != nextline; lp++, p++) | |
979 | *p = lp->text; | |
980 | ||
981 | free (lineinfo); | |
982 | } | |
983 | else | |
984 | qsort (linearray, nextline - linearray, sizeof (char *), compare_full); | |
985 | ||
986 | /* Open the output file */ | |
987 | ||
988 | if (outfile) | |
989 | { | |
990 | ostream = fopen (outfile, "w"); | |
991 | if (!ostream) | |
992 | pfatal_with_name (outfile); | |
993 | } | |
994 | ||
995 | writelines (linearray, nextline - linearray, ostream); | |
996 | if (outfile) fclose (ostream); | |
997 | ||
998 | free (linearray); | |
999 | free (data); | |
1000 | } | |
1001 | \f | |
1002 | /* Parse an input string in core into lines. | |
1003 | DATA is the input string, and SIZE is its length. | |
1004 | Data goes in LINEARRAY starting at NEXTLINE. | |
1005 | The value returned is the first entry in LINEARRAY still unused. | |
1006 | Value 0 means input file contents are invalid. */ | |
1007 | ||
1008 | char ** | |
1009 | parsefile (filename, nextline, data, size) | |
1010 | char *filename; | |
1011 | char **nextline; | |
1012 | char *data; | |
1013 | long size; | |
1014 | { | |
1015 | char *p, *end; | |
1016 | char **line = nextline; | |
1017 | ||
1018 | p = data; | |
1019 | end = p + size; | |
1020 | *end = 0; | |
1021 | ||
1022 | while (p != end) | |
1023 | { | |
1024 | if (p[0] != '\\' && p[0] != '@') | |
1025 | return 0; | |
1026 | ||
1027 | *line = p; | |
1028 | while (*p && *p != '\n') p++; | |
1029 | if (p != end) p++; | |
1030 | ||
1031 | line++; | |
1032 | if (line == linearray + nlines) | |
1033 | { | |
1034 | char **old = linearray; | |
1035 | linearray = (char **) xrealloc (linearray, sizeof (char *) * (nlines *= 4)); | |
1036 | line += linearray - old; | |
1037 | } | |
1038 | } | |
1039 | ||
1040 | return line; | |
1041 | } | |
1042 | \f | |
1043 | /* Indexification is a filter applied to the sorted lines | |
1044 | as they are being written to the output file. | |
1045 | Multiple entries for the same name, with different page numbers, | |
1046 | get combined into a single entry with multiple page numbers. | |
1047 | The first braced field, which is used for sorting, is discarded. | |
1048 | However, its first character is examined, folded to lower case, | |
1049 | and if it is different from that in the previous line fed to us | |
1050 | a \initial line is written with one argument, the new initial. | |
1051 | ||
1052 | If an entry has four braced fields, then the second and third | |
1053 | constitute primary and secondary names. | |
1054 | In this case, each change of primary name | |
1055 | generates a \primary line which contains only the primary name, | |
1056 | and in between these are \secondary lines which contain | |
1057 | just a secondary name and page numbers. | |
1058 | */ | |
1059 | ||
1060 | /* The last primary name we wrote a \primary entry for. | |
1061 | If only one level of indexing is being done, this is the last name seen */ | |
1062 | char *lastprimary; | |
1063 | int lastprimarylength; /* Length of storage allocated for lastprimary */ | |
1064 | ||
1065 | /* Similar, for the secondary name. */ | |
1066 | char *lastsecondary; | |
1067 | int lastsecondarylength; | |
1068 | ||
1069 | /* Zero if we are not in the middle of writing an entry. | |
1070 | One if we have written the beginning of an entry but have not | |
1071 | yet written any page numbers into it. | |
1072 | Greater than one if we have written the beginning of an entry | |
1073 | plus at least one page number. */ | |
1074 | int pending; | |
1075 | ||
1076 | /* The initial (for sorting purposes) of the last primary entry written. | |
1077 | When this changes, a \initial {c} line is written */ | |
1078 | ||
1079 | char * lastinitial; | |
1080 | ||
1081 | int lastinitiallength; | |
1082 | ||
1083 | /* When we need a string of length 1 for the value of lastinitial, | |
1084 | store it here. */ | |
1085 | ||
1086 | char lastinitial1[2]; | |
1087 | ||
1088 | /* Initialize static storage for writing an index */ | |
1089 | ||
1090 | void | |
1091 | init_index () | |
1092 | { | |
1093 | pending = 0; | |
1094 | lastinitial = lastinitial1; | |
1095 | lastinitial1[0] = 0; | |
1096 | lastinitial1[1] = 0; | |
1097 | lastinitiallength = 0; | |
1098 | lastprimarylength = 100; | |
1099 | lastprimary = (char *) xmalloc (lastprimarylength + 1); | |
1100 | bzero (lastprimary, lastprimarylength + 1); | |
1101 | lastsecondarylength = 100; | |
1102 | lastsecondary = (char *) xmalloc (lastsecondarylength + 1); | |
1103 | bzero (lastsecondary, lastsecondarylength + 1); | |
1104 | } | |
1105 | ||
1106 | /* Indexify. Merge entries for the same name, | |
1107 | insert headers for each initial character, etc. */ | |
1108 | ||
1109 | indexify (line, ostream) | |
1110 | char *line; | |
1111 | FILE *ostream; | |
1112 | { | |
1113 | char *primary, *secondary, *pagenumber; | |
1114 | int primarylength, secondarylength, pagelength; | |
1115 | int len = strlen (line); | |
1116 | int nosecondary; | |
1117 | int initiallength; | |
1118 | char *initial; | |
1119 | char initial1[2]; | |
1120 | register char *p; | |
1121 | ||
1122 | /* First, analyze the parts of the entry fed to us this time */ | |
1123 | ||
1124 | p = find_braced_pos (line, 0, 0, 0); | |
1125 | if (*p == '{') | |
1126 | { | |
1127 | initial = p; | |
1128 | /* Get length of inner pair of braces starting at p, | |
1129 | including that inner pair of braces. */ | |
1130 | initiallength = find_braced_end (p + 1) + 1 - p; | |
1131 | } | |
1132 | else | |
1133 | { | |
1134 | initial = initial1; | |
1135 | initial1[0] = *p; | |
1136 | initial1[1] = 0; | |
1137 | initiallength = 1; | |
1138 | ||
1139 | if (initial1[0] >= 'a' && initial1[0] <= 'z') | |
1140 | initial1[0] -= 040; | |
1141 | } | |
1142 | ||
1143 | pagenumber = find_braced_pos (line, 1, 0, 0); | |
1144 | pagelength = find_braced_end (pagenumber) - pagenumber; | |
1145 | if (pagelength == 0) | |
1146 | abort (); | |
1147 | ||
1148 | primary = find_braced_pos (line, 2, 0, 0); | |
1149 | primarylength = find_braced_end (primary) - primary; | |
1150 | ||
1151 | secondary = find_braced_pos (line, 3, 0, 0); | |
1152 | nosecondary = !*secondary; | |
1153 | if (!nosecondary) | |
1154 | secondarylength = find_braced_end (secondary) - secondary; | |
1155 | ||
1156 | /* If the primary is different from before, make a new primary entry */ | |
1157 | if (strncmp (primary, lastprimary, primarylength)) | |
1158 | { | |
1159 | /* Close off current secondary entry first, if one is open */ | |
1160 | if (pending) | |
1161 | { | |
1162 | fputs ("}\n", ostream); | |
1163 | pending = 0; | |
1164 | } | |
1165 | ||
1166 | /* If this primary has a different initial, include an entry for the initial */ | |
1167 | if (initiallength != lastinitiallength || | |
1168 | strncmp (initial, lastinitial, initiallength)) | |
1169 | { | |
1170 | fprintf (ostream, "\\initial {"); | |
1171 | fwrite (initial, 1, initiallength, ostream); | |
1172 | fprintf (ostream, "}\n", initial); | |
1173 | if (initial == initial1) | |
1174 | { | |
1175 | lastinitial = lastinitial1; | |
1176 | *lastinitial1 = *initial1; | |
1177 | } | |
1178 | else | |
1179 | { | |
1180 | lastinitial = initial; | |
1181 | } | |
1182 | lastinitiallength = initiallength; | |
1183 | } | |
1184 | ||
1185 | /* Make the entry for the primary. */ | |
1186 | if (nosecondary) | |
1187 | fputs ("\\entry {", ostream); | |
1188 | else | |
1189 | fputs ("\\primary {", ostream); | |
1190 | fwrite (primary, primarylength, 1, ostream); | |
1191 | if (nosecondary) | |
1192 | { | |
1193 | fputs ("}{", ostream); | |
1194 | pending = 1; | |
1195 | } | |
1196 | else | |
1197 | fputs ("}\n", ostream); | |
1198 | ||
1199 | /* Record name of most recent primary */ | |
1200 | if (lastprimarylength < primarylength) | |
1201 | { | |
1202 | lastprimarylength = primarylength + 100; | |
1203 | lastprimary = (char *) xrealloc (lastprimary, | |
1204 | 1 + lastprimarylength); | |
1205 | } | |
1206 | strncpy (lastprimary, primary, primarylength); | |
1207 | lastprimary[primarylength] = 0; | |
1208 | ||
1209 | /* There is no current secondary within this primary, now */ | |
1210 | lastsecondary[0] = 0; | |
1211 | } | |
1212 | ||
1213 | /* Should not have an entry with no subtopic following one with a subtopic */ | |
1214 | ||
1215 | if (nosecondary && *lastsecondary) | |
1216 | error ("entry %s follows an entry with a secondary name", line); | |
1217 | ||
1218 | /* Start a new secondary entry if necessary */ | |
1219 | if (!nosecondary && strncmp (secondary, lastsecondary, secondarylength)) | |
1220 | { | |
1221 | if (pending) | |
1222 | { | |
1223 | fputs ("}\n", ostream); | |
1224 | pending = 0; | |
1225 | } | |
1226 | ||
1227 | /* Write the entry for the secondary. */ | |
1228 | fputs ("\\secondary {", ostream); | |
1229 | fwrite (secondary, secondarylength, 1, ostream); | |
1230 | fputs ("}{", ostream); | |
1231 | pending = 1; | |
1232 | ||
1233 | /* Record name of most recent secondary */ | |
1234 | if (lastsecondarylength < secondarylength) | |
1235 | { | |
1236 | lastsecondarylength = secondarylength + 100; | |
1237 | lastsecondary = (char *) xrealloc (lastsecondary, | |
1238 | 1 + lastsecondarylength); | |
1239 | } | |
1240 | strncpy (lastsecondary, secondary, secondarylength); | |
1241 | lastsecondary[secondarylength] = 0; | |
1242 | } | |
1243 | ||
1244 | /* Here to add one more page number to the current entry */ | |
1245 | if (pending++ != 1) | |
1246 | fputs (", ", ostream); /* Punctuate first, if this is not the first */ | |
1247 | fwrite (pagenumber, pagelength, 1, ostream); | |
1248 | } | |
1249 | ||
1250 | /* Close out any unfinished output entry */ | |
1251 | ||
1252 | void | |
1253 | finish_index (ostream) | |
1254 | FILE *ostream; | |
1255 | { | |
1256 | if (pending) | |
1257 | fputs ("}\n", ostream); | |
1258 | free (lastprimary); | |
1259 | free (lastsecondary); | |
1260 | } | |
1261 | \f | |
1262 | /* Copy the lines in the sorted order. | |
1263 | Each line is copied out of the input file it was found in. */ | |
1264 | ||
1265 | void | |
1266 | writelines (linearray, nlines, ostream) | |
1267 | char **linearray; | |
1268 | int nlines; | |
1269 | FILE *ostream; | |
1270 | { | |
1271 | char **stop_line = linearray + nlines; | |
1272 | char **next_line; | |
1273 | ||
1274 | init_index (); | |
1275 | ||
1276 | /* Output the text of the lines, and free the buffer space */ | |
1277 | ||
1278 | for (next_line = linearray; next_line != stop_line; next_line++) | |
1279 | { | |
1280 | /* If -u was specified, output the line only if distinct from previous one. */ | |
1281 | if (next_line == linearray | |
1282 | /* Compare previous line with this one, using only the explicitly specd keyfields */ | |
1283 | || compare_general (*(next_line - 1), *next_line, 0L, 0L, num_keyfields - 1)) | |
1284 | { | |
1285 | char *p = *next_line; | |
1286 | char c; | |
1287 | while ((c = *p++) && c != '\n'); | |
1288 | *(p-1) = 0; | |
1289 | indexify (*next_line, ostream); | |
1290 | } | |
1291 | } | |
1292 | ||
1293 | finish_index (ostream); | |
1294 | } | |
1295 | \f | |
1296 | /* Assume (and optionally verify) that each input file is sorted; | |
1297 | merge them and output the result. | |
1298 | Returns nonzero if any input file fails to be sorted. | |
1299 | ||
1300 | This is the high-level interface that can handle an unlimited number of files. */ | |
1301 | ||
1302 | #define MAX_DIRECT_MERGE 10 | |
1303 | ||
1304 | int | |
1305 | merge_files (infiles, nfiles, outfile) | |
1306 | char **infiles; | |
1307 | int nfiles; | |
1308 | char *outfile; | |
1309 | { | |
1310 | char **tempfiles; | |
1311 | int ntemps; | |
1312 | int i; | |
1313 | int value = 0; | |
1314 | int start_tempcount = tempcount; | |
1315 | ||
1316 | if (nfiles <= MAX_DIRECT_MERGE) | |
1317 | return merge_direct (infiles, nfiles, outfile); | |
1318 | ||
1319 | /* Merge groups of MAX_DIRECT_MERGE input files at a time, | |
1320 | making a temporary file to hold each group's result. */ | |
1321 | ||
1322 | ntemps = (nfiles + MAX_DIRECT_MERGE - 1) / MAX_DIRECT_MERGE; | |
1323 | tempfiles = (char **) xmalloc (ntemps * sizeof (char *)); | |
1324 | for (i = 0; i < ntemps; i++) | |
1325 | { | |
1326 | int nf = MAX_DIRECT_MERGE; | |
1327 | if (i + 1 == ntemps) | |
1328 | nf = nfiles - i * MAX_DIRECT_MERGE; | |
1329 | tempfiles[i] = maketempname (++tempcount); | |
1330 | value |= merge_direct (&infiles[i * MAX_DIRECT_MERGE], nf, tempfiles[i]); | |
1331 | } | |
1332 | ||
1333 | /* All temporary files that existed before are no longer needed | |
1334 | since their contents have been merged into our new tempfiles. | |
1335 | So delete them. */ | |
1336 | flush_tempfiles (start_tempcount); | |
1337 | ||
1338 | /* Now merge the temporary files we created. */ | |
1339 | ||
1340 | merge_files (tempfiles, ntemps, outfile); | |
1341 | ||
1342 | free (tempfiles); | |
1343 | ||
1344 | return value; | |
1345 | } | |
1346 | \f | |
1347 | /* Assume (and optionally verify) that each input file is sorted; | |
1348 | merge them and output the result. | |
1349 | Returns nonzero if any input file fails to be sorted. | |
1350 | ||
1351 | This version of merging will not work if the number of | |
1352 | input files gets too high. Higher level functions | |
1353 | use it only with a bounded number of input files. */ | |
1354 | ||
1355 | int | |
1356 | merge_direct (infiles, nfiles, outfile) | |
1357 | char **infiles; | |
1358 | int nfiles; | |
1359 | char *outfile; | |
1360 | { | |
1361 | char **ip = infiles; | |
1362 | struct linebuffer *lb1, *lb2; | |
1363 | struct linebuffer **thisline, **prevline; | |
1364 | FILE **streams; | |
1365 | int i; | |
1366 | int nleft; | |
1367 | int lossage = 0; | |
1368 | int *file_lossage; | |
1369 | struct linebuffer *prev_out = 0; | |
1370 | FILE *ostream = stdout; | |
1371 | ||
1372 | if (outfile) | |
1373 | { | |
1374 | ostream = fopen (outfile, "w"); | |
1375 | } | |
1376 | if (!ostream) pfatal_with_name (outfile); | |
1377 | ||
1378 | init_index (); | |
1379 | ||
1380 | if (nfiles == 0) | |
1381 | { | |
1382 | if (outfile) | |
1383 | fclose (ostream); | |
1384 | return 0; | |
1385 | } | |
1386 | ||
1387 | /* For each file, make two line buffers. | |
1388 | Also, for each file, there is an element of `thisline' | |
1389 | which points at any time to one of the file's two buffers, | |
1390 | and an element of `prevline' which points to the other buffer. | |
1391 | `thisline' is supposed to point to the next available line from the file, | |
1392 | while `prevline' holds the last file line used, | |
1393 | which is remembered so that we can verify that the file is properly sorted. */ | |
1394 | ||
1395 | /* lb1 and lb2 contain one buffer each per file */ | |
1396 | lb1 = (struct linebuffer *) xmalloc (nfiles * sizeof (struct linebuffer)); | |
1397 | lb2 = (struct linebuffer *) xmalloc (nfiles * sizeof (struct linebuffer)); | |
1398 | ||
1399 | /* thisline[i] points to the linebuffer holding the next available line in file i, | |
1400 | or is zero if there are no lines left in that file. */ | |
1401 | thisline = (struct linebuffer **) xmalloc (nfiles * sizeof (struct linebuffer *)); | |
1402 | /* prevline[i] points to the linebuffer holding the last used line from file i. | |
1403 | This is just for verifying that file i is properly sorted. */ | |
1404 | prevline = (struct linebuffer **) xmalloc (nfiles * sizeof (struct linebuffer *)); | |
1405 | /* streams[i] holds the input stream for file i. */ | |
1406 | streams = (FILE **) xmalloc (nfiles * sizeof (FILE *)); | |
1407 | /* file_lossage[i] is nonzero if we already know file i is not properly sorted. */ | |
1408 | file_lossage = (int *) xmalloc (nfiles * sizeof (int)); | |
1409 | ||
1410 | /* Allocate and initialize all that storage */ | |
1411 | ||
1412 | for (i = 0; i < nfiles; i++) | |
1413 | { | |
1414 | initbuffer (&lb1[i]); | |
1415 | initbuffer (&lb2[i]); | |
1416 | thisline[i] = &lb1[i]; | |
1417 | prevline[i] = &lb2[i]; | |
1418 | file_lossage[i] = 0; | |
1419 | streams[i] = fopen (infiles[i], "r"); | |
1420 | if (!streams[i]) | |
1421 | pfatal_with_name (infiles[i]); | |
1422 | ||
1423 | readline (thisline[i], streams[i]); | |
1424 | } | |
1425 | ||
1426 | /* Keep count of number of files not at eof */ | |
1427 | nleft = nfiles; | |
1428 | ||
1429 | while (nleft) | |
1430 | { | |
1431 | struct linebuffer *best = 0; | |
1432 | struct linebuffer *exch; | |
1433 | int bestfile = -1; | |
1434 | int i; | |
1435 | ||
1436 | /* Look at the next avail line of each file; choose the least one. */ | |
1437 | ||
1438 | for (i = 0; i < nfiles; i++) | |
1439 | { | |
1440 | if (thisline[i] && | |
1441 | (!best || | |
1442 | 0 < compare_general (best->buffer, thisline[i]->buffer, | |
1443 | (long) bestfile, (long) i, num_keyfields))) | |
1444 | { | |
1445 | best = thisline[i]; | |
1446 | bestfile = i; | |
1447 | } | |
1448 | } | |
1449 | ||
1450 | /* Output that line, unless it matches the previous one and we don't want duplicates */ | |
1451 | ||
1452 | if (!(prev_out && | |
1453 | !compare_general (prev_out->buffer, best->buffer, 0L, 1L, num_keyfields - 1))) | |
1454 | indexify (best->buffer, ostream); | |
1455 | prev_out = best; | |
1456 | ||
1457 | /* Now make the line the previous of its file, and fetch a new line from that file */ | |
1458 | ||
1459 | exch = prevline[bestfile]; | |
1460 | prevline[bestfile] = thisline[bestfile]; | |
1461 | thisline[bestfile] = exch; | |
1462 | ||
1463 | while (1) | |
1464 | { | |
1465 | /* If the file has no more, mark it empty */ | |
1466 | ||
1467 | if (feof (streams[bestfile])) | |
1468 | { | |
1469 | thisline[bestfile] = 0; | |
1470 | nleft--; /* Update the number of files still not empty */ | |
1471 | break; | |
1472 | } | |
1473 | readline (thisline[bestfile], streams[bestfile]); | |
1474 | if (thisline[bestfile]->buffer[0] || !feof (streams[bestfile])) break; | |
1475 | } | |
1476 | } | |
1477 | ||
1478 | finish_index (ostream); | |
1479 | ||
1480 | /* Free all storage and close all input streams */ | |
1481 | ||
1482 | for (i = 0; i < nfiles; i++) | |
1483 | { | |
1484 | fclose (streams[i]); | |
1485 | free (lb1[i].buffer); | |
1486 | free (lb2[i].buffer); | |
1487 | } | |
1488 | free (file_lossage); | |
1489 | free (lb1); | |
1490 | free (lb2); | |
1491 | free (thisline); | |
1492 | free (prevline); | |
1493 | free (streams); | |
1494 | ||
1495 | if (outfile) | |
1496 | fclose (ostream); | |
1497 | ||
1498 | return lossage; | |
1499 | } | |
1500 | \f | |
1501 | /* Print error message and exit. */ | |
1502 | ||
1503 | fatal (s1, s2) | |
1504 | char *s1, *s2; | |
1505 | { | |
1506 | error (s1, s2); | |
1507 | exit (EXIT_FATAL); | |
1508 | } | |
1509 | ||
1510 | /* Print error message. `s1' is printf control string, `s2' is arg for it. */ | |
1511 | ||
1512 | error (s1, s2) | |
1513 | char *s1, *s2; | |
1514 | { | |
1515 | printf ("texindex: "); | |
1516 | printf (s1, s2); | |
1517 | printf ("\n"); | |
1518 | } | |
1519 | ||
1520 | perror_with_name (name) | |
1521 | char *name; | |
1522 | { | |
1523 | char *s; | |
1524 | ||
1525 | if (errno < sys_nerr) | |
1526 | s = concat ("", sys_errlist[errno], " for %s"); | |
1527 | else | |
1528 | s = "cannot open %s"; | |
1529 | error (s, name); | |
1530 | } | |
1531 | ||
1532 | pfatal_with_name (name) | |
1533 | char *name; | |
1534 | { | |
1535 | char *s; | |
1536 | ||
1537 | if (errno < sys_nerr) | |
1538 | s = concat ("", sys_errlist[errno], " for %s"); | |
1539 | else | |
1540 | s = "cannot open %s"; | |
1541 | fatal (s, name); | |
1542 | } | |
1543 | ||
1544 | /* Return a newly-allocated string whose contents concatenate those of s1, s2, s3. */ | |
1545 | ||
1546 | char * | |
1547 | concat (s1, s2, s3) | |
1548 | char *s1, *s2, *s3; | |
1549 | { | |
1550 | int len1 = strlen (s1), len2 = strlen (s2), len3 = strlen (s3); | |
1551 | char *result = (char *) xmalloc (len1 + len2 + len3 + 1); | |
1552 | ||
1553 | strcpy (result, s1); | |
1554 | strcpy (result + len1, s2); | |
1555 | strcpy (result + len1 + len2, s3); | |
1556 | *(result + len1 + len2 + len3) = 0; | |
1557 | ||
1558 | return result; | |
1559 | } | |
1560 | ||
1561 | /* Like malloc but get fatal error if memory is exhausted. */ | |
1562 | ||
1563 | int | |
1564 | xmalloc (size) | |
1565 | int size; | |
1566 | { | |
1567 | int result = malloc (size); | |
1568 | if (!result) | |
1569 | fatal ("virtual memory exhausted", 0); | |
1570 | return result; | |
1571 | } | |
1572 | ||
1573 | ||
1574 | int | |
1575 | xrealloc (ptr, size) | |
1576 | char *ptr; | |
1577 | int size; | |
1578 | { | |
1579 | int result = realloc (ptr, size); | |
1580 | if (!result) | |
1581 | fatal ("virtual memory exhausted"); | |
1582 | return result; | |
1583 | } | |
1584 | ||
1585 | bzero (b, length) | |
1586 | register char *b; | |
1587 | register int length; | |
1588 | { | |
1589 | #ifdef VMS | |
1590 | short zero = 0; | |
1591 | long max_str = 65535; | |
1592 | long len; | |
1593 | ||
1594 | while (length > max_str) | |
1595 | { | |
1596 | (void) LIB$MOVC5 (&zero, &zero, &zero, &max_str, b); | |
1597 | length -= max_str; | |
1598 | b += max_str; | |
1599 | } | |
1600 | len = length; | |
1601 | (void) LIB$MOVC5 (&zero, &zero, &zero, &len, b); | |
1602 | #else | |
1603 | while (length-- > 0) | |
1604 | *b++ = 0; | |
1605 | #endif /* not VMS */ | |
1606 | } |