Commit | Line | Data |
---|---|---|
496af5c8 TV |
1 | #!/bin/sh |
2 | ||
3 | # Copyright (C) 2019 Free Software Foundation, Inc. | |
4 | # This program is free software; you can redistribute it and/or modify | |
5 | # it under the terms of the GNU General Public License as published by | |
6 | # the Free Software Foundation; either version 3 of the License, or | |
7 | # (at your option) any later version. | |
8 | # | |
9 | # This program is distributed in the hope that it will be useful, | |
10 | # but WITHOUT ANY WARRANTY; without even the implied warranty of | |
11 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
12 | # GNU General Public License for more details. | |
13 | # | |
14 | # You should have received a copy of the GNU General Public License | |
15 | # along with this program. If not, see <http://www.gnu.org/licenses/>. | |
16 | ||
3cf2f237 | 17 | # This script intends to facilitate spell checking of source/doc files. |
496af5c8 | 18 | # It: |
3cf2f237 | 19 | # - transforms the files into a list of lowercase words |
496af5c8 TV |
20 | # - prefixes each word with the frequency |
21 | # - filters out words within a frequency range | |
22 | # - sorts the words, longest first | |
23 | # | |
3cf2f237 TV |
24 | # If '-c' is passed as option, it operates on the C comments only, rather than |
25 | # on the entire file. | |
26 | # | |
496af5c8 TV |
27 | # For: |
28 | # ... | |
3cf2f237 TV |
29 | # $ files=$(find gdb -type f \( -name "*.c" -o -name "*.h" \)) |
30 | # $ ./gdb/contrib/words.sh -c $files | |
496af5c8 TV |
31 | # ... |
32 | # it generates a list of ~15000 words prefixed with frequency. | |
33 | # | |
34 | # This could be used to generate a dictionary that is kept as part of the | |
35 | # sources, against which new code can be checked, generating a warning or | |
36 | # error. The hope is that misspellings would trigger this frequently, and rare | |
37 | # words rarely, otherwise the burden of updating the dictionary would be too | |
38 | # much. | |
39 | # | |
40 | # And for: | |
41 | # ... | |
3cf2f237 TV |
42 | # $ files=$(find gdb -type f \( -name "*.c" -o -name "*.h" \)) |
43 | # $ ./gdb/contrib/words.sh -c -f 1 $files | |
496af5c8 TV |
44 | # ... |
45 | # it generates a list of ~5000 words with frequency 1. | |
46 | # | |
47 | # This can be used to scan for misspellings manually. | |
48 | # | |
49 | ||
# Frequency range selected on the command line.  Empty means "not given";
# 0 means "no limit on this side of the range".
minfreq=
maxfreq=
# Whether to operate on C comments only ('-c').
c=false

# Report a usage error on stderr and abort the script.
die ()
{
  printf '%s: %s\n' "${0##*/}" "$*" >&2
  exit 1
}

# Check the value of a frequency option.
#   $1 - the option as written by the user (for the error message)
#   $2 - number of arguments still available, including the option itself
#   $3 - the candidate value
# Aborts if the value is missing or is not a non-negative integer.  Without
# this check a trailing '-f' made 'shift 2' fail, which some shells turn into
# an endless loop, and a malformed value ended up in the frequency filter.
check_freq_arg ()
{
  [ "$2" -ge 2 ] || die "option '$1' requires an argument"
  case "$3" in
    ''|*[!0-9]*)
      die "option '$1' expects a non-negative integer, got '$3'"
      ;;
  esac
}

# Parse the leading options of the argument list.
# Sets c, minfreq and maxfreq, and stores in optcount the number of
# arguments that were consumed, so the caller can shift them away.
# Parsing stops at the first argument that is not a known option; that
# argument and everything after it are treated as input files.
parse_options ()
{
  optcount=0
  while [ $# -gt 0 ]; do
    case "$1" in
      -c)
        c=true
        optcount=$((optcount + 1))
        shift
        ;;
      --freq|-f)
        check_freq_arg "$1" $# "$2"
        minfreq=$2
        maxfreq=$2
        optcount=$((optcount + 2))
        shift 2
        ;;
      --min)
        check_freq_arg "$1" $# "$2"
        minfreq=$2
        # Only a lower bound was requested so far: leave the top open.
        if [ "$maxfreq" = "" ]; then
          maxfreq=0
        fi
        optcount=$((optcount + 2))
        shift 2
        ;;
      --max)
        check_freq_arg "$1" $# "$2"
        maxfreq=$2
        # Only an upper bound was requested so far: leave the bottom open.
        if [ "$minfreq" = "" ]; then
          minfreq=0
        fi
        optcount=$((optcount + 2))
        shift 2
        ;;
      *)
        break
        ;;
    esac
  done
}

parse_options "$@"
# Drop the parsed options; what remains in "$@" is the list of input files.
shift "$optcount"

# No frequency option at all: do not filter.
if [ "$minfreq" = "" ] && [ "$maxfreq" = "" ]; then
  minfreq=0
  maxfreq=0
fi
88 | ||
# Write the awk program used by '-c' to a temporary file that is removed
# when the script exits.
awkfile=$(mktemp) || exit 1
trap 'rm -f "$awkfile"' EXIT

# The delimiter is quoted so that the shell leaves the awk text alone.
cat > "$awkfile" <<'EOF'
# Print the text found inside C block comments, one output line per input
# line that contains any part of a comment.  Several comments on one line,
# and a line that closes one comment and opens the next, are all handled.
# Comment markers inside string literals are not recognized as such, and
# '//' comments are ignored.
BEGIN {
  in_comment = 0
}

{
  rest = $0               # part of the line not yet scanned
  text = ""               # comment text collected from this line
  seen = in_comment       # did any part of this line lie in a comment?

  while (1) {
    if (in_comment) {
      pos = index(rest, "*/")
      if (pos == 0) {
        # The comment continues on the next line.
        text = text rest
        break
      }
      text = text substr(rest, 1, pos - 1)
      rest = substr(rest, pos + 2)
      in_comment = 0
    } else {
      pos = index(rest, "/*")
      if (pos == 0) {
        break
      }
      # Keep the words of separate comments apart.
      if (seen) {
        text = text " "
      }
      rest = substr(rest, pos + 2)
      in_comment = 1
      seen = 1
    }
  }

  if (seen) {
    print text
  }
}
EOF
119 | ||
# Stabilize sort.
export LC_ALL=C

# Pipeline stages:
#  1. read the input files, reduced to C comment text if '-c' was given
#  2. replace punctuation, brackets, quotes and digit runs with newlines so
#     that each word ends up on its own line, then drop blanks
#  3. lowercase, sort and count identical words
#  4. keep the words whose count lies in [minfreq, maxfreq], where 0 means
#     that side of the range is open; the bounds are passed with -v so they
#     are treated as data rather than spliced into the awk program
#  5. sort the "count word" lines by length, longest first, using a
#     temporary length prefix that is cut off again at the end
if [ "$c" = true ]; then
  awk \
    -f "$awkfile" \
    -- "$@"
else
  cat -- "$@"
fi \
  | sed \
      -e 's/[!"?;:%^$~#{}`&=@,. \t\/_()|<>\+\*-]/\n/g' \
      -e 's/\[/\n/g' \
      -e 's/\]/\n/g' \
      -e "s/'/\n/g" \
      -e 's/[0-9][0-9]*/\n/g' \
      -e 's/[ \t]*//g' \
  | tr '[:upper:]' '[:lower:]' \
  | sort \
  | uniq -c \
  | awk -v min="$minfreq" -v max="$maxfreq" \
      '(min == 0 || min <= $1) && (max == 0 || $1 <= max)' \
  | awk '{ print length($0) " " $0; }' \
  | sort -n -r \
  | cut -d ' ' -f 2-