Savannah Git Hosting - coreutils.git/blob

1 /* sort - sort lines of text (with all kinds of options).

4 This program is free software: you can redistribute it and/or modify

5 it under the terms of the GNU General Public License as published by

6 the Free Software Foundation, either version 3 of the License, or

7 (at your option) any later version.

9 This program is distributed in the hope that it will be useful,

10 but WITHOUT ANY WARRANTY; without even the implied warranty of

11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the

12 GNU General Public License for more details.

14 You should have received a copy of the GNU General Public License

15 along with this program. If not, see <http://www.gnu.org/licenses/>.

17 Written December 1988 by Mike Haertel.

18 The author may be reached (Email) at the address mike@gnu.ai.mit.edu,

19 or (US mail) as Mike Haertel c/o Free Software Foundation.

21 Ørn E. Hansen added NLS support in 1997. */

23 #include <config.h>

25 #include <getopt.h>

26 #include <sys/types.h>

27 #include <sys/wait.h>

28 #include <signal.h>

29 #include "system.h"

30 #include "argmatch.h"

31 #include "error.h"

32 #include "filevercmp.h"

33 #include "hash.h"

34 #include "md5.h"

35 #include "physmem.h"

36 #include "posixver.h"

37 #include "quote.h"

38 #include "quotearg.h"

39 #include "randread.h"

40 #include "readtokens0.h"

41 #include "stdio--.h"

42 #include "stdlib--.h"

43 #include "strnumcmp.h"

44 #include "xmemcoll.h"

45 #include "xmemxfrm.h"

46 #include "xstrtol.h"

48 #if HAVE_SYS_RESOURCE_H

49 # include <sys/resource.h>

50 #endif

51 #ifndef RLIMIT_DATA

52 struct rlimit { size_t rlim_cur; };

53 # define getrlimit(Resource, Rlp) (-1)

54 #endif

56 /* The official name of this program (e.g., no `g' prefix). */

57 #define PROGRAM_NAME "sort"

59 #define AUTHORS \

60 proper_name ("Mike Haertel"), \

61 proper_name ("Paul Eggert")

63 #if HAVE_LANGINFO_CODESET

64 # include <langinfo.h>

65 #endif

67 /* Use SA_NOCLDSTOP as a proxy for whether the sigaction machinery is

68 present. */

69 #ifndef SA_NOCLDSTOP

70 # define SA_NOCLDSTOP 0

71 /* No sigprocmask. Always 'return' zero. */

72 # define sigprocmask(How, Set, Oset) (0)

73 # define sigset_t int

74 # if ! HAVE_SIGINTERRUPT

75 # define siginterrupt(sig, flag) /* empty */

76 # endif

77 #endif

79 #if !defined OPEN_MAX && defined NR_OPEN

80 # define OPEN_MAX NR_OPEN

81 #endif

82 #if !defined OPEN_MAX

83 # define OPEN_MAX 20

84 #endif

86 #define UCHAR_LIM (UCHAR_MAX + 1)

88 #ifndef DEFAULT_TMPDIR

89 # define DEFAULT_TMPDIR "/tmp"

90 #endif

92 /* Exit statuses. */

93 enum

94 {

95 /* POSIX says to exit with status 1 if invoked with -c and the

96 input is not properly sorted. */

97 SORT_OUT_OF_ORDER = 1,

99 /* POSIX says any other irregular exit must exit with a status

100 code greater than 1. */

101 SORT_FAILURE = 2

102 };

103

104 enum

105 {

106 /* The number of times we should try to fork a compression process

107 (we retry if the fork call fails). We don't _need_ to compress

108 temp files, this is just to reduce disk access, so this number

109 can be small. */

110 MAX_FORK_TRIES_COMPRESS = 2,

111

112 /* The number of times we should try to fork a decompression process.

113 If we can't fork a decompression process, we can't sort, so this

114 number should be big. */

115 MAX_FORK_TRIES_DECOMPRESS = 8

116 };

117

118 /* The representation of the decimal point in the current locale. */

119 static int decimal_point;

120

121 /* Thousands separator; if -1, then there isn't one. */

122 static int thousands_sep;

123

124 /* Nonzero if the corresponding locales are hard. */

125 static bool hard_LC_COLLATE;

126 #if HAVE_NL_LANGINFO

127 static bool hard_LC_TIME;

128 #endif

129

130 #define NONZERO(x) ((x) != 0)

131

132 /* The kind of blanks for '-b' to skip in various options. */

133 enum blanktype { bl_start, bl_end, bl_both };

134

135 /* The character marking end of line. Default to \n. */

136 static char eolchar = '\n';

137

138 /* Lines are held in core as counted strings. */

139 struct line

140 {

141 char *text; /* Text of the line. */

142 size_t length; /* Length including final newline. */

143 char *keybeg; /* Start of first key. */

144 char *keylim; /* Limit of first key. */

145 };

146

147 /* Input buffers. */

148 struct buffer

149 {

150 char *buf; /* Dynamically allocated buffer,

151 partitioned into 3 regions:

152 - input data;

153 - unused area;

154 - an array of lines, in reverse order. */

155 size_t used; /* Number of bytes used for input data. */

156 size_t nlines; /* Number of lines in the line array. */

157 size_t alloc; /* Number of bytes allocated. */

158 size_t left; /* Number of bytes left from previous reads. */

159 size_t line_bytes; /* Number of bytes to reserve for each line. */

160 bool eof; /* An EOF has been read. */

161 };

162

163 struct keyfield

164 {

165 size_t sword; /* Zero-origin 'word' to start at. */

166 size_t schar; /* Additional characters to skip. */

167 size_t eword; /* Zero-origin first word after field. */

168 size_t echar; /* Additional characters in field. */

169 bool const *ignore; /* Boolean array of characters to ignore. */

170 char const *translate; /* Translation applied to characters. */

171 bool skipsblanks; /* Skip leading blanks when finding start. */

172 bool skipeblanks; /* Skip leading blanks when finding end. */

173 bool numeric; /* Flag for numeric comparison. Handle

174 strings of digits with optional decimal

175 point, but no exponential notation. */

176 bool random; /* Sort by random hash of key. */

177 bool general_numeric; /* Flag for general, numeric comparison.

178 Handle numbers in exponential notation. */

179 bool month; /* Flag for comparison by month name. */

180 bool reverse; /* Reverse the sense of comparison. */

181 bool version; /* sort by version number */

182 struct keyfield *next; /* Next keyfield to try. */

183 };

184

185 struct month

186 {

187 char const *name;

188 int val;

189 };

190

191 /* FIXME: None of these tables work with multibyte character sets.

192 Also, there are many other bugs when handling multibyte characters.

193 One way to fix this is to rewrite `sort' to use wide characters

194 internally, but doing this with good performance is a bit

195 tricky. */

196

197 /* Table of blanks. */

198 static bool blanks[UCHAR_LIM];

199

200 /* Table of non-printing characters. */

201 static bool nonprinting[UCHAR_LIM];

202

203 /* Table of non-dictionary characters (not letters, digits, or blanks). */

204 static bool nondictionary[UCHAR_LIM];

205

206 /* Translation table folding lower case to upper. */

207 static char fold_toupper[UCHAR_LIM];

208

209 #define MONTHS_PER_YEAR 12

210

211 /* Table mapping month names to integers.

212 Alphabetic order allows binary search. */

213 static struct month monthtab[] =

214 {

215 {"APR", 4},

216 {"AUG", 8},

217 {"DEC", 12},

218 {"FEB", 2},

219 {"JAN", 1},

220 {"JUL", 7},

221 {"JUN", 6},

222 {"MAR", 3},

223 {"MAY", 5},

224 {"NOV", 11},

225 {"OCT", 10},

226 {"SEP", 9}

227 };

228

229 /* During the merge phase, the number of files to merge at once. */

230 #define NMERGE_DEFAULT 16

231

232 /* Minimum size for a merge or check buffer. */

233 #define MIN_MERGE_BUFFER_SIZE (2 + sizeof (struct line))

234

235 /* Minimum sort size; the code might not work with smaller sizes. */

236 #define MIN_SORT_SIZE (nmerge * MIN_MERGE_BUFFER_SIZE)

237

238 /* The number of bytes needed for a merge or check buffer, which can

239 function relatively efficiently even if it holds only one line. If

240 a longer line is seen, this value is increased. */

241 static size_t merge_buffer_size = MAX (MIN_MERGE_BUFFER_SIZE, 256 * 1024);

242

243 /* The approximate maximum number of bytes of main memory to use, as

244 specified by the user. Zero if the user has not specified a size. */

245 static size_t sort_size;

246

247 /* The guessed size for non-regular files. */

248 #define INPUT_FILE_SIZE_GUESS (1024 * 1024)

249

250 /* Array of directory names in which any temporary files are to be created. */

251 static char const **temp_dirs;

252

253 /* Number of temporary directory names used. */

254 static size_t temp_dir_count;

255

256 /* Number of allocated slots in temp_dirs. */

257 static size_t temp_dir_alloc;

258

259 /* Flag to reverse the order of all comparisons. */

260 static bool reverse;

261

262 /* Flag for stable sort. This turns off the last ditch bytewise

263 comparison of lines, and instead leaves lines in the same order

264 they were read if all keys compare equal. */

265 static bool stable;

266

267 /* If TAB has this value, blanks separate fields. */

268 enum { TAB_DEFAULT = CHAR_MAX + 1 };

269

270 /* Tab character separating fields. If TAB_DEFAULT, then fields are

271 separated by the empty string between a non-blank character and a blank

272 character. */

273 static int tab = TAB_DEFAULT;

274

275 /* Flag to remove consecutive duplicate lines from the output.

276 Only the last of a sequence of equal lines will be output. */

277 static bool unique;

278

279 /* Nonzero if any of the input files are the standard input. */

280 static bool have_read_stdin;

281

282 /* List of key field comparisons to be tried. */

283 static struct keyfield *keylist;

284

285 /* Program used to (de)compress temp files. Must accept -d. */

286 static char const *compress_program;

287

288 /* Maximum number of files to merge in one go. If more than this

289 number are present, temp files will be used. */

290 static unsigned int nmerge = NMERGE_DEFAULT;

291

292 static void sortlines_temp (struct line *, size_t, struct line *);

293

294 /* Report MESSAGE for FILE, then clean up and exit.

295 If FILE is null, it represents standard output. */

296

297 static void die (char const *, char const *) ATTRIBUTE_NORETURN;

298 static void

299 die (char const *message, char const *file)

300 {

301 error (0, errno, "%s: %s", message, file ? file : _("standard output"));

302 exit (SORT_FAILURE);

303 }

304

305 void

306 usage (int status)

307 {

308 if (status != EXIT_SUCCESS)

309 fprintf (stderr, _("Try `%s --help' for more information.\n"),

310 program_name);

311 else

312 {

313 printf (_("\

314 Usage: %s [OPTION]... [FILE]...\n\

315 or: %s [OPTION]... --files0-from=F\n\

316 "),

317 program_name, program_name);

318 fputs (_("\

319 Write sorted concatenation of all FILE(s) to standard output.\n\

320 \n\

321 "), stdout);

322 fputs (_("\

323 Mandatory arguments to long options are mandatory for short options too.\n\

324 "), stdout);

325 fputs (_("\

326 Ordering options:\n\

327 \n\

328 "), stdout);

329 fputs (_("\

330 -b, --ignore-leading-blanks ignore leading blanks\n\

331 -d, --dictionary-order consider only blanks and alphanumeric characters\n\

332 -f, --ignore-case fold lower case to upper case characters\n\

333 "), stdout);

334 fputs (_("\

335 -g, --general-numeric-sort compare according to general numerical value\n\

336 -i, --ignore-nonprinting consider only printable characters\n\

337 -M, --month-sort compare (unknown) < `JAN' < ... < `DEC'\n\

338 "), stdout);

339 fputs (_("\

340 -n, --numeric-sort compare according to string numerical value\n\

341 -R, --random-sort sort by random hash of keys\n\

342 --random-source=FILE get random bytes from FILE (default /dev/urandom)\n\

343 -r, --reverse reverse the result of comparisons\n\

344 "), stdout);

345 fputs (_("\

346 --sort=WORD sort according to WORD:\n\

347 general-numeric -g, month -M, numeric -n,\n\

348 random -R, version -V\n\

349 -V, --version-sort sort by numeric version\n\

350 \n\

351 "), stdout);

352 fputs (_("\

353 Other options:\n\

354 \n\

355 "), stdout);

356 fputs (_("\

357 --batch-size=NMERGE merge at most NMERGE inputs at once;\n\

358 for more use temp files\n\

359 "), stdout);

360 fputs (_("\

361 -c, --check, --check=diagnose-first check for sorted input; do not sort\n\

362 -C, --check=quiet, --check=silent like -c, but do not report first bad line\n\

363 --compress-program=PROG compress temporaries with PROG;\n\

364 decompress them with PROG -d\n\

365 --files0-from=F read input from the files specified by\n\

366 NUL-terminated names in file F;\n\

367 If F is - then read names from standard input\n\

368 "), stdout);

369 fputs (_("\

370 -k, --key=POS1[,POS2] start a key at POS1 (origin 1), end it at POS2\n\

371 (default end of line)\n\

372 -m, --merge merge already sorted files; do not sort\n\

373 "), stdout);

374 fputs (_("\

375 -o, --output=FILE write result to FILE instead of standard output\n\

376 -s, --stable stabilize sort by disabling last-resort comparison\n\

377 -S, --buffer-size=SIZE use SIZE for main memory buffer\n\

378 "), stdout);

379 printf (_("\

380 -t, --field-separator=SEP use SEP instead of non-blank to blank transition\n\

381 -T, --temporary-directory=DIR use DIR for temporaries, not $TMPDIR or %s;\n\

382 multiple options specify multiple directories\n\

383 -u, --unique with -c, check for strict ordering;\n\

384 without -c, output only the first of an equal run\n\

385 "), DEFAULT_TMPDIR);

386 fputs (_("\

387 -z, --zero-terminated end lines with 0 byte, not newline\n\

388 "), stdout);

389 fputs (HELP_OPTION_DESCRIPTION, stdout);

390 fputs (VERSION_OPTION_DESCRIPTION, stdout);

391 fputs (_("\

392 \n\

393 POS is F[.C][OPTS], where F is the field number and C the character position\n\

394 in the field; both are origin 1. If neither -t nor -b is in effect, characters\n\

395 in a field are counted from the beginning of the preceding whitespace. OPTS is\n\

396 one or more single-letter ordering options, which override global ordering\n\

397 options for that key. If no key is given, use the entire line as the key.\n\

398 \n\

399 SIZE may be followed by the following multiplicative suffixes:\n\

400 "), stdout);

401 fputs (_("\

402 % 1% of memory, b 1, K 1024 (default), and so on for M, G, T, P, E, Z, Y.\n\

403 \n\

404 With no FILE, or when FILE is -, read standard input.\n\

405 \n\

406 *** WARNING ***\n\

407 The locale specified by the environment affects sort order.\n\

408 Set LC_ALL=C to get the traditional sort order that uses\n\

409 native byte values.\n\

410 "), stdout );

411 emit_bug_reporting_address ();

412 }

413

414 exit (status);

415 }

416

417 /* For long options that have no equivalent short option, use a

418 non-character as a pseudo short option, starting with CHAR_MAX + 1. */

419 enum

420 {

421 CHECK_OPTION = CHAR_MAX + 1,

422 COMPRESS_PROGRAM_OPTION,

423 FILES0_FROM_OPTION,

424 NMERGE_OPTION,

425 RANDOM_SOURCE_OPTION,

426 SORT_OPTION

427 };

428

429 static char const short_options[] = "-bcCdfgik:mMno:rRsS:t:T:uVy:z";

430

431 static struct option const long_options[] =

432 {

433 {"ignore-leading-blanks", no_argument, NULL, 'b'},

434 {"check", optional_argument, NULL, CHECK_OPTION},

435 {"compress-program", required_argument, NULL, COMPRESS_PROGRAM_OPTION},

436 {"dictionary-order", no_argument, NULL, 'd'},

437 {"ignore-case", no_argument, NULL, 'f'},

438 {"files0-from", required_argument, NULL, FILES0_FROM_OPTION},

439 {"general-numeric-sort", no_argument, NULL, 'g'},

440 {"ignore-nonprinting", no_argument, NULL, 'i'},

441 {"key", required_argument, NULL, 'k'},

442 {"merge", no_argument, NULL, 'm'},

443 {"month-sort", no_argument, NULL, 'M'},

444 {"numeric-sort", no_argument, NULL, 'n'},

445 {"version-sort", no_argument, NULL, 'V'},

446 {"random-sort", no_argument, NULL, 'R'},

447 {"random-source", required_argument, NULL, RANDOM_SOURCE_OPTION},

448 {"sort", required_argument, NULL, SORT_OPTION},

449 {"output", required_argument, NULL, 'o'},

450 {"reverse", no_argument, NULL, 'r'},

451 {"stable", no_argument, NULL, 's'},

452 {"batch-size", required_argument, NULL, NMERGE_OPTION},

453 {"buffer-size", required_argument, NULL, 'S'},

454 {"field-separator", required_argument, NULL, 't'},

455 {"temporary-directory", required_argument, NULL, 'T'},

456 {"unique", no_argument, NULL, 'u'},

457 {"zero-terminated", no_argument, NULL, 'z'},

458 {GETOPT_HELP_OPTION_DECL},

459 {GETOPT_VERSION_OPTION_DECL},

460 {NULL, 0, NULL, 0},

461 };

462

463 #define CHECK_TABLE \

464 _ct_("quiet", 'C') \

465 _ct_("silent", 'C') \

466 _ct_("diagnose-first", 'c')

467

468 static char const *const check_args[] =

469 {

470 #define _ct_(_s, _c) _s,

471 CHECK_TABLE NULL

472 #undef _ct_

473 };

474 static char const check_types[] =

475 {

476 #define _ct_(_s, _c) _c,

477 CHECK_TABLE

478 #undef _ct_

479 };

480

481 #define SORT_TABLE \

482 _st_("general-numeric", 'g') \

483 _st_("month", 'M') \

484 _st_("numeric", 'n') \

485 _st_("random", 'R') \

486 _st_("version", 'V')

487

488 static char const *const sort_args[] =

489 {

490 #define _st_(_s, _c) _s,

491 SORT_TABLE NULL

492 #undef _st_

493 };

494 static char const sort_types[] =

495 {

496 #define _st_(_s, _c) _c,

497 SORT_TABLE

498 #undef _st_

499 };

500

501 /* The set of signals that are caught. */

502 static sigset_t caught_signals;

503

504 /* Critical section status. */

505 struct cs_status

506 {

507 bool valid;

508 sigset_t sigs;

509 };

510

511 /* Enter a critical section. */

512 static struct cs_status

513 cs_enter (void)

514 {

515 struct cs_status status;

516 status.valid = (sigprocmask (SIG_BLOCK, &caught_signals, &status.sigs) == 0);

517 return status;

518 }

519

520 /* Leave a critical section. */

521 static void

522 cs_leave (struct cs_status status)

523 {

524 if (status.valid)

525 {

526 /* Ignore failure when restoring the signal mask. */

527 sigprocmask (SIG_SETMASK, &status.sigs, NULL);

528 }

529 }

530

531 /* The list of temporary files. */

532 struct tempnode

533 {

534 struct tempnode *volatile next;

535 pid_t pid; /* If compressed, the pid of compressor, else zero */

536 char name[1]; /* Actual size is 1 + file name length. */

537 };

538 static struct tempnode *volatile temphead;

539 static struct tempnode *volatile *temptail = &temphead;

540

541 struct sortfile

542 {

543 char const *name;

544 pid_t pid; /* If compressed, the pid of compressor, else zero */

545 };

546

547 /* A table where we store compression process states. We clean up all

548 processes in a timely manner so as not to exhaust system resources,

549 so we store the info on whether the process is still running, or has

550 been reaped here. */

551 static Hash_table *proctab;

552

553 enum { INIT_PROCTAB_SIZE = 47 };

554

555 enum procstate { ALIVE, ZOMBIE };

556

557 /* A proctab entry. The COUNT field is there in case we fork a new

558 compression process that has the same PID as an old zombie process

559 that is still in the table (because the process to decompress the

560 temp file it was associated with hasn't started yet). */

561 struct procnode

562 {

563 pid_t pid;

564 enum procstate state;

565 size_t count;

566 };

567

568 static size_t

569 proctab_hasher (const void *entry, size_t tabsize)

570 {

571 const struct procnode *node = entry;

572 return node->pid % tabsize;

573 }

574

575 static bool

576 proctab_comparator (const void *e1, const void *e2)

577 {

578 const struct procnode *n1 = e1, *n2 = e2;

579 return n1->pid == n2->pid;

580 }

581

582 /* The total number of forked processes (compressors and decompressors)

583 that have not been reaped yet. */

584 static size_t nprocs;

585

586 /* The number of child processes we'll allow before we try to reap some. */

587 enum { MAX_PROCS_BEFORE_REAP = 2 };

588

589 /* If 0 < PID, wait for the child process with that PID to exit.

590 If PID is -1, clean up a random child process which has finished and

591 return the process ID of that child. If PID is -1 and no processes

592 have quit yet, return 0 without waiting. */

593

594 static pid_t

595 reap (pid_t pid)

596 {

597 int status;

598 pid_t cpid = waitpid (pid, &status, pid < 0 ? WNOHANG : 0);

599

600 if (cpid < 0)

601 error (SORT_FAILURE, errno, _("waiting for %s [-d]"),

602 compress_program);

603 else if (0 < cpid)

604 {

605 if (! WIFEXITED (status) || WEXITSTATUS (status))

606 error (SORT_FAILURE, 0, _("%s [-d] terminated abnormally"),

607 compress_program);

608 --nprocs;

609 }

610

611 return cpid;

612 }

613

614 /* Add the PID of a running compression process to proctab, or update

615 the entry COUNT and STATE fields if it's already there. This also

616 creates the table for us the first time it's called. */

617

618 static void

619 register_proc (pid_t pid)

620 {

621 struct procnode test, *node;

622

623 if (! proctab)

624 {

625 proctab = hash_initialize (INIT_PROCTAB_SIZE, NULL,

626 proctab_hasher,

627 proctab_comparator,

628 free);

629 if (! proctab)

630 xalloc_die ();

631 }

632

633 test.pid = pid;

634 node = hash_lookup (proctab, &test);

635 if (node)

636 {

637 node->state = ALIVE;

638 ++node->count;

639 }

640 else

641 {

642 node = xmalloc (sizeof *node);

643 node->pid = pid;

644 node->state = ALIVE;

645 node->count = 1;

646 hash_insert (proctab, node);

647 }

648 }

649

650 /* This is called when we reap a random process. We don't know

651 whether we have reaped a compression process or a decompression

652 process until we look in the table. If there's an ALIVE entry for

653 it, then we have reaped a compression process, so change the state

654 to ZOMBIE. Otherwise, it's a decompression processes, so ignore it. */

655

656 static void

657 update_proc (pid_t pid)

658 {

659 struct procnode test, *node;

660

661 test.pid = pid;

662 node = hash_lookup (proctab, &test);

663 if (node)

664 node->state = ZOMBIE;

665 }

666

667 /* This is for when we need to wait for a compression process to exit.

668 If it has a ZOMBIE entry in the table then it's already dead and has

669 been reaped. Note that if there's an ALIVE entry for it, it still may

670 already have died and been reaped if a second process was created with

671 the same PID. This is probably exceedingly rare, but to be on the safe

672 side we will have to wait for any compression process with this PID. */

673

674 static void

675 wait_proc (pid_t pid)

676 {

677 struct procnode test, *node;

678

679 test.pid = pid;

680 node = hash_lookup (proctab, &test);

681 if (node->state == ALIVE)

682 reap (pid);

683

684 node->state = ZOMBIE;

685 if (! --node->count)

686 {

687 hash_delete (proctab, node);

688 free (node);

689 }

690 }

691

692 /* Keep reaping finished children as long as there are more to reap.

693 This doesn't block waiting for any of them, it only reaps those

694 that are already dead. */

695

696 static void

697 reap_some (void)

698 {

699 pid_t pid;

700

701 while (0 < nprocs && (pid = reap (-1)))

702 update_proc (pid);

703 }

704

705 /* Clean up any remaining temporary files. */

706

707 static void

708 cleanup (void)

709 {

710 struct tempnode const *node;

711

712 for (node = temphead; node; node = node->next)

713 unlink (node->name);

714 temphead = NULL;

715 }

716

717 /* Cleanup actions to take when exiting. */

718

719 static void

720 exit_cleanup (void)

721 {

722 if (temphead)

723 {

724 /* Clean up any remaining temporary files in a critical section so

725 that a signal handler does not try to clean them too. */

726 struct cs_status cs = cs_enter ();

727 cleanup ();

728 cs_leave (cs);

729 }

730

731 close_stdout ();

732 }

733

734 /* Create a new temporary file, returning its newly allocated tempnode.

735 Store into *PFD the file descriptor open for writing. */

736

737 static struct tempnode *

738 create_temp_file (int *pfd)

739 {

740 static char const slashbase[] = "/sortXXXXXX";

741 static size_t temp_dir_index;

742 int fd;

743 int saved_errno;

744 char const *temp_dir = temp_dirs[temp_dir_index];

745 size_t len = strlen (temp_dir);

746 struct tempnode *node =

747 xmalloc (offsetof (struct tempnode, name) + len + sizeof slashbase);

748 char *file = node->name;

749 struct cs_status cs;

750

751 memcpy (file, temp_dir, len);

752 memcpy (file + len, slashbase, sizeof slashbase);

753 node->next = NULL;

754 node->pid = 0;

755 if (++temp_dir_index == temp_dir_count)

756 temp_dir_index = 0;

757

758 /* Create the temporary file in a critical section, to avoid races. */

759 cs = cs_enter ();

760 fd = mkstemp (file);

761 if (0 <= fd)

762 {

763 *temptail = node;

764 temptail = &node->next;

765 }

766 saved_errno = errno;

767 cs_leave (cs);

768 errno = saved_errno;

769

770 if (fd < 0)

771 error (SORT_FAILURE, errno, _("cannot create temporary file in %s"),

772 quote (temp_dir));

773

774 *pfd = fd;

775 return node;

776 }

777

778 /* Return a stream for FILE, opened with mode HOW. A null FILE means

779 standard output; HOW should be "w". When opening for input, "-"

780 means standard input. To avoid confusion, do not return file

781 descriptors STDIN_FILENO, STDOUT_FILENO, or STDERR_FILENO when

782 opening an ordinary FILE. */

783

784 static FILE *

785 xfopen (const char *file, const char *how)

786 {

787 FILE *fp;

788

789 if (!file)

790 fp = stdout;

791 else if (STREQ (file, "-") && *how == 'r')

792 {

793 have_read_stdin = true;

794 fp = stdin;

795 }

796 else

797 {

798 fp = fopen (file, how);

799 if (! fp)

800 die (_("open failed"), file);

801 }

802

803 return fp;

804 }

805

806 /* Close FP, whose name is FILE, and report any errors. */

807

808 static void

809 xfclose (FILE *fp, char const *file)

810 {

811 switch (fileno (fp))

812 {

813 case STDIN_FILENO:

814 /* Allow reading stdin from tty more than once. */

815 if (feof (fp))

816 clearerr (fp);

817 break;

818

819 case STDOUT_FILENO:

820 /* Don't close stdout just yet. close_stdout does that. */

821 if (fflush (fp) != 0)

822 die (_("fflush failed"), file);

823 break;

824

825 default:

826 if (fclose (fp) != 0)

827 die (_("close failed"), file);

828 break;

829 }

830 }

831

832 static void

833 dup2_or_die (int oldfd, int newfd)

834 {

835 if (dup2 (oldfd, newfd) < 0)

836 error (SORT_FAILURE, errno, _("dup2 failed"));

837 }

838

839 /* Fork a child process for piping to and do common cleanup. The

840 TRIES parameter tells us how many times to try to fork before

841 giving up. Return the PID of the child or -1 if fork failed. */

842

843 static pid_t

844 pipe_fork (int pipefds[2], size_t tries)

845 {

846 #if HAVE_WORKING_FORK

847 struct tempnode *saved_temphead;

848 int saved_errno;

849 unsigned int wait_retry = 1;

850 pid_t pid IF_LINT (= -1);

851 struct cs_status cs;

852

853 if (pipe (pipefds) < 0)

854 return -1;

855

856 while (tries--)

857 {

858 /* This is so the child process won't delete our temp files

859 if it receives a signal before exec-ing. */

860 cs = cs_enter ();

861 saved_temphead = temphead;

862 temphead = NULL;

863

864 pid = fork ();

865 saved_errno = errno;

866 if (pid)

867 temphead = saved_temphead;

868

869 cs_leave (cs);

870 errno = saved_errno;

871

872 if (0 <= pid || errno != EAGAIN)

873 break;

874 else

875 {

876 sleep (wait_retry);

877 wait_retry *= 2;

878 reap_some ();

879 }

880 }

881

882 if (pid < 0)

883 {

884 close (pipefds[0]);

885 close (pipefds[1]);

886 }

887 else if (pid == 0)

888 {

889 close (STDIN_FILENO);

890 close (STDOUT_FILENO);

891 }

892 else

893 ++nprocs;

894

895 return pid;

896

897 #else /* ! HAVE_WORKING_FORK */

898 return -1;

899 #endif

900 }

901

902 /* Create a temporary file and start a compression program to filter output

903 to that file. Set *PFP to the file handle and if *PPID is non-NULL,

904 set it to the PID of the newly-created process. */

905

906 static char *

907 create_temp (FILE **pfp, pid_t *ppid)

908 {

909 int tempfd;

910 struct tempnode *node = create_temp_file (&tempfd);

911 char *name = node->name;

912

913 if (compress_program)

914 {

915 int pipefds[2];

916

917 node->pid = pipe_fork (pipefds, MAX_FORK_TRIES_COMPRESS);

918 if (0 < node->pid)

919 {

920 close (tempfd);

921 close (pipefds[0]);

922 tempfd = pipefds[1];

923

924 register_proc (node->pid);

925 }

926 else if (node->pid == 0)

927 {

928 close (pipefds[1]);

929 dup2_or_die (tempfd, STDOUT_FILENO);

930 close (tempfd);

931 dup2_or_die (pipefds[0], STDIN_FILENO);

932 close (pipefds[0]);

933

934 if (execlp (compress_program, compress_program, (char *) NULL) < 0)

935 error (SORT_FAILURE, errno, _("couldn't execute %s"),

936 compress_program);

937 }

938 else

939 node->pid = 0;

940 }

941

942 *pfp = fdopen (tempfd, "w");

943 if (! *pfp)

944 die (_("couldn't create temporary file"), name);

945

946 if (ppid)

947 *ppid = node->pid;

948

949 return name;

950 }

951

952 /* Open a compressed temp file and start a decompression process through

953 which to filter the input. PID must be the valid processes ID of the

954 process used to compress the file. */

955

956 static FILE *

957 open_temp (const char *name, pid_t pid)

958 {

959 int tempfd, pipefds[2];

960 pid_t child_pid;

961 FILE *fp;

962

963 wait_proc (pid);

964

965 tempfd = open (name, O_RDONLY);

966 if (tempfd < 0)

967 die (_("couldn't open temporary file"), name);

968

969 child_pid = pipe_fork (pipefds, MAX_FORK_TRIES_DECOMPRESS);

970 if (0 < child_pid)

971 {

972 close (tempfd);

973 close (pipefds[1]);

974 }

975 else if (child_pid == 0)

976 {

977 close (pipefds[0]);

978 dup2_or_die (tempfd, STDIN_FILENO);

979 close (tempfd);

980 dup2_or_die (pipefds[1], STDOUT_FILENO);

981 close (pipefds[1]);

982

983 if (execlp (compress_program, compress_program, "-d", (char *) NULL) < 0)

984 error (SORT_FAILURE, errno, _("couldn't execute %s -d"),

985 compress_program);

986 }

987 else

988 error (SORT_FAILURE, errno, _("couldn't create process for %s -d"),

989 compress_program);

990

991 fp = fdopen (pipefds[0], "r");

992 if (! fp)

993 die (_("couldn't create temporary file"), name);

994

995 return fp;

996 }

997

998 static void

999 write_bytes (const char *buf, size_t n_bytes, FILE *fp, const char *output_file)

1000 {

1001 if (fwrite (buf, 1, n_bytes, fp) != n_bytes)

1002 die (_("write failed"), output_file);

1003 }

1004

1005 /* Append DIR to the array of temporary directory names. */

1006 static void

1007 add_temp_dir (char const *dir)

1008 {

1009 if (temp_dir_count == temp_dir_alloc)

1010 temp_dirs = X2NREALLOC (temp_dirs, &temp_dir_alloc);

1011

1012 temp_dirs[temp_dir_count++] = dir;

1013 }

1014

1015 /* Remove NAME from the list of temporary files. */

1016

1017 static void

1018 zaptemp (const char *name)

1019 {

1020 struct tempnode *volatile *pnode;

1021 struct tempnode *node;

1022 struct tempnode *next;

1023 int unlink_status;

1024 int unlink_errno = 0;

1025 struct cs_status cs;

1026

1027 for (pnode = &temphead; (node = *pnode)->name != name; pnode = &node->next)

1028 continue;

1029

1030 /* Unlink the temporary file in a critical section to avoid races. */

1031 next = node->next;

1032 cs = cs_enter ();

1033 unlink_status = unlink (name);

1034 unlink_errno = errno;

1035 *pnode = next;

1036 cs_leave (cs);

1037

1038 if (unlink_status != 0)

1039 error (0, unlink_errno, _("warning: cannot remove: %s"), name);

1040 if (! next)

1041 temptail = pnode;

1042 free (node);

1043 }

1044

1045 #if HAVE_NL_LANGINFO

1046

1047 static int

1048 struct_month_cmp (const void *m1, const void *m2)

1049 {

1050 struct month const *month1 = m1;

1051 struct month const *month2 = m2;

1052 return strcmp (month1->name, month2->name);

1053 }

1054

1055 #endif

1056

1057 /* Initialize the character class tables. */

1058

1059 static void

1060 inittables (void)

1061 {

1062 size_t i;

1063

1064 for (i = 0; i < UCHAR_LIM; ++i)

1065 {

1066 blanks[i] = !! isblank (i);

1067 nonprinting[i] = ! isprint (i);

1068 nondictionary[i] = ! isalnum (i) && ! isblank (i);

1069 fold_toupper[i] = toupper (i);

1070 }

1071

1072 #if HAVE_NL_LANGINFO

1073 /* If we're not in the "C" locale, read different names for months. */

1074 if (hard_LC_TIME)

1075 {

1076 for (i = 0; i < MONTHS_PER_YEAR; i++)

1077 {

1078 char const *s;

1079 size_t s_len;

1080 size_t j;

1081 char *name;

1082

1083 s = (char *) nl_langinfo (ABMON_1 + i);

1084 s_len = strlen (s);

1085 monthtab[i].name = name = xmalloc (s_len + 1);

1086 monthtab[i].val = i + 1;

1087

1088 for (j = 0; j < s_len; j++)

1089 name[j] = fold_toupper[to_uchar (s[j])];

1090 name[j] = '\0';

1091 }

1092 qsort ((void *) monthtab, MONTHS_PER_YEAR,

1093 sizeof *monthtab, struct_month_cmp);

1094 }

1095 #endif

1096 }

1097

1098 /* Specify how many inputs may be merged at once.

1099 This may be set on the command-line with the

1100 --batch-size option. */

1101 static void

1102 specify_nmerge (int oi, char c, char const *s)

1103 {

1104 uintmax_t n;

1105 struct rlimit rlimit;

1106 enum strtol_error e = xstrtoumax (s, NULL, 10, &n, NULL);

1107

1108 /* Try to find out how many file descriptors we'll be able

1109 to open. We need at least nmerge + 3 (STDIN_FILENO,

1110 STDOUT_FILENO and STDERR_FILENO). */

1111 unsigned int max_nmerge = ((getrlimit (RLIMIT_NOFILE, &rlimit) == 0

1112 ? rlimit.rlim_cur

1113 : OPEN_MAX)

1114 - 3);

1115

1116 if (e == LONGINT_OK)

1117 {

1118 nmerge = n;

1119 if (nmerge != n)

1120 e = LONGINT_OVERFLOW;

1121 else

1122 {

1123 if (nmerge < 2)

1124 {

1125 error (0, 0, _("invalid --%s argument %s"),

1126 long_options[oi].name, quote(s));

1127 error (SORT_FAILURE, 0,

1128 _("minimum --%s argument is %s"),

1129 long_options[oi].name, quote("2"));

1130 }

1131 else if (max_nmerge < nmerge)

1132 {

1133 e = LONGINT_OVERFLOW;

1134 }

1135 else

1136 return;

1137 }

1138 }

1139

1140 if (e == LONGINT_OVERFLOW)

1141 {

1142 char max_nmerge_buf[INT_BUFSIZE_BOUND (unsigned int)];

1143 error (0, 0, _("--%s argument %s too large"),

1144 long_options[oi].name, quote(s));

1145 error (SORT_FAILURE, 0,

1146 _("maximum --%s argument with current rlimit is %s"),

1147 long_options[oi].name,

1148 uinttostr (max_nmerge, max_nmerge_buf));

1149 }

1150 else

1151 xstrtol_fatal (e, oi, c, long_options, s);

1152 }

1153

1154 /* Specify the amount of main memory to use when sorting. */

1155 static void

1156 specify_sort_size (int oi, char c, char const *s)

1157 {

1158 uintmax_t n;

1159 char *suffix;

1160 enum strtol_error e = xstrtoumax (s, &suffix, 10, &n, "EgGkKmMPtTYZ");

1161

1162 /* The default unit is KiB. */

1163 if (e == LONGINT_OK && ISDIGIT (suffix[-1]))

1164 {

1165 if (n <= UINTMAX_MAX / 1024)

1166 n *= 1024;

1167 else

1168 e = LONGINT_OVERFLOW;

1169 }

1170

1171 /* A 'b' suffix means bytes; a '%' suffix means percent of memory. */

1172 if (e == LONGINT_INVALID_SUFFIX_CHAR && ISDIGIT (suffix[-1]) && ! suffix[1])

1173 switch (suffix[0])

1174 {

1175 case 'b':

1176 e = LONGINT_OK;

1177 break;

1178

1179 case '%':

1180 {

1181 double mem = physmem_total () * n / 100;

1182

1183 /* Use "<", not "<=", to avoid problems with rounding. */

1184 if (mem < UINTMAX_MAX)

1185 {

1186 n = mem;

1187 e = LONGINT_OK;

1188 }

1189 else

1190 e = LONGINT_OVERFLOW;

1191 }

1192 break;

1193 }

1194

1195 if (e == LONGINT_OK)

1196 {

1197 /* If multiple sort sizes are specified, take the maximum, so

1198 that option order does not matter. */

1199 if (n < sort_size)

1200 return;

1201

1202 sort_size = n;

1203 if (sort_size == n)

1204 {

1205 sort_size = MAX (sort_size, MIN_SORT_SIZE);

1206 return;

1207 }

1208

1209 e = LONGINT_OVERFLOW;

1210 }

1211

1212 xstrtol_fatal (e, oi, c, long_options, s);

1213 }

1214

1215 /* Return the default sort size. */

1216 static size_t

1217 default_sort_size (void)

1218 {

1219 /* Let MEM be available memory or 1/8 of total memory, whichever

1220 is greater. */

1221 double avail = physmem_available ();

1222 double total = physmem_total ();

1223 double mem = MAX (avail, total / 8);

1224 struct rlimit rlimit;

1225

1226 /* Let SIZE be MEM, but no more than the maximum object size or

1227 system resource limits. Avoid the MIN macro here, as it is not

1228 quite right when only one argument is floating point. Don't

1229 bother to check for values like RLIM_INFINITY since in practice

1230 they are not much less than SIZE_MAX. */

1231 size_t size = SIZE_MAX;

1232 if (mem < size)

1233 size = mem;

1234 if (getrlimit (RLIMIT_DATA, &rlimit) == 0 && rlimit.rlim_cur < size)

1235 size = rlimit.rlim_cur;

1236 #ifdef RLIMIT_AS

1237 if (getrlimit (RLIMIT_AS, &rlimit) == 0 && rlimit.rlim_cur < size)

1238 size = rlimit.rlim_cur;

1239 #endif

1240

1241 /* Leave a large safety margin for the above limits, as failure can

1242 occur when they are exceeded. */

1243 size /= 2;

1244

1245 #ifdef RLIMIT_RSS

1246 /* Leave a 1/16 margin for RSS to leave room for code, stack, etc.

1247 Exceeding RSS is not fatal, but can be quite slow. */

1248 if (getrlimit (RLIMIT_RSS, &rlimit) == 0 && rlimit.rlim_cur / 16 * 15 < size)

1249 size = rlimit.rlim_cur / 16 * 15;

1250 #endif

1251

1252 /* Use no less than the minimum. */

1253 return MAX (size, MIN_SORT_SIZE);

1254 }

1255

1256 /* Return the sort buffer size to use with the input files identified

1257 by FPS and FILES, which are alternate names of the same files.

1258 NFILES gives the number of input files; NFPS may be less. Assume

1259 that each input line requires LINE_BYTES extra bytes' worth of line

1260 information. Do not exceed the size bound specified by the user

1261 (or a default size bound, if the user does not specify one). */

1262

1263 static size_t

1264 sort_buffer_size (FILE *const *fps, size_t nfps,

1265 char *const *files, size_t nfiles,

1266 size_t line_bytes)

1267 {

1268 /* A bound on the input size. If zero, the bound hasn't been

1269 determined yet. */

1270 static size_t size_bound;

1271

1272 /* In the worst case, each input byte is a newline. */

1273 size_t worst_case_per_input_byte = line_bytes + 1;

1274

1275 /* Keep enough room for one extra input line and an extra byte.

1276 This extra room might be needed when preparing to read EOF. */

1277 size_t size = worst_case_per_input_byte + 1;

1278

1279 size_t i;

1280

1281 for (i = 0; i < nfiles; i++)

1282 {

1283 struct stat st;

1284 off_t file_size;

1285 size_t worst_case;

1286

1287 if ((i < nfps ? fstat (fileno (fps[i]), &st)

1288 : STREQ (files[i], "-") ? fstat (STDIN_FILENO, &st)

1289 : stat (files[i], &st))

1290 != 0)

1291 die (_("stat failed"), files[i]);

1292

1293 if (S_ISREG (st.st_mode))

1294 file_size = st.st_size;

1295 else

1296 {

1297 /* The file has unknown size. If the user specified a sort

1298 buffer size, use that; otherwise, guess the size. */

1299 if (sort_size)

1300 return sort_size;

1301 file_size = INPUT_FILE_SIZE_GUESS;

1302 }

1303

1304 if (! size_bound)

1305 {

1306 size_bound = sort_size;

1307 if (! size_bound)

1308 size_bound = default_sort_size ();

1309 }

1310

1311 /* Add the amount of memory needed to represent the worst case

1312 where the input consists entirely of newlines followed by a

1313 single non-newline. Check for overflow. */

1314 worst_case = file_size * worst_case_per_input_byte + 1;

1315 if (file_size != worst_case / worst_case_per_input_byte

1316 || size_bound - size <= worst_case)

1317 return size_bound;

1318 size += worst_case;

1319 }

1320

1321 return size;

1322 }

1323

1324 /* Initialize BUF. Reserve LINE_BYTES bytes for each line; LINE_BYTES

1325 must be at least sizeof (struct line). Allocate ALLOC bytes

1326 initially. */

1327

1328 static void

1329 initbuf (struct buffer *buf, size_t line_bytes, size_t alloc)

1330 {

1331 /* Ensure that the line array is properly aligned. If the desired

1332 size cannot be allocated, repeatedly halve it until allocation

1333 succeeds. The smaller allocation may hurt overall performance,

1334 but that's better than failing. */

1335 for (;;)

1336 {

1337 alloc += sizeof (struct line) - alloc % sizeof (struct line);

1338 buf->buf = malloc (alloc);

1339 if (buf->buf)

1340 break;

1341 alloc /= 2;

1342 if (alloc <= line_bytes + 1)

1343 xalloc_die ();

1344 }

1345

1346 buf->line_bytes = line_bytes;

1347 buf->alloc = alloc;

1348 buf->used = buf->left = buf->nlines = 0;

1349 buf->eof = false;

1350 }

1351

1352 /* Return one past the limit of the line array. */

1353

1354 static inline struct line *

1355 buffer_linelim (struct buffer const *buf)

1356 {

1357 return (struct line *) (buf->buf + buf->alloc);

1358 }

1359

1360 /* Return a pointer to the first character of the field specified

1361 by KEY in LINE. */

1362

1363 static char *

1364 begfield (const struct line *line, const struct keyfield *key)

1365 {

1366 char *ptr = line->text, *lim = ptr + line->length - 1;

1367 size_t sword = key->sword;

1368 size_t schar = key->schar;

1369 size_t remaining_bytes;

1370

1371 /* The leading field separator itself is included in a field when -t

1372 is absent. */

1373

1374 if (tab != TAB_DEFAULT)

1375 while (ptr < lim && sword--)

1376 {

1377 while (ptr < lim && *ptr != tab)

1378 ++ptr;

1379 if (ptr < lim)

1380 ++ptr;

1381 }

1382 else

1383 while (ptr < lim && sword--)

1384 {

1385 while (ptr < lim && blanks[to_uchar (*ptr)])

1386 ++ptr;

1387 while (ptr < lim && !blanks[to_uchar (*ptr)])

1388 ++ptr;

1389 }

1390

1391 if (key->skipsblanks)

1392 while (ptr < lim && blanks[to_uchar (*ptr)])

1393 ++ptr;

1394

1395 /* Advance PTR by SCHAR (if possible), but no further than LIM. */

1396 remaining_bytes = lim - ptr;

1397 if (schar < remaining_bytes)

1398 ptr += schar;

1399 else

1400 ptr = lim;

1401

1402 return ptr;

1403 }

1404

1405 /* Return the limit of (a pointer to the first character after) the field

1406 in LINE specified by KEY. */

1407

1408 static char *

1409 limfield (const struct line *line, const struct keyfield *key)

1410 {

1411 char *ptr = line->text, *lim = ptr + line->length - 1;

1412 size_t eword = key->eword, echar = key->echar;

1413 size_t remaining_bytes;

1414

1415 /* Move PTR past EWORD fields or to one past the last byte on LINE,

1416 whichever comes first. If there are more than EWORD fields, leave

1417 PTR pointing at the beginning of the field having zero-based index,

1418 EWORD. If a delimiter character was specified (via -t), then that

1419 `beginning' is the first character following the delimiting TAB.

1420 Otherwise, leave PTR pointing at the first `blank' character after

1421 the preceding field. */

1422 if (tab != TAB_DEFAULT)

1423 while (ptr < lim && eword--)

1424 {

1425 while (ptr < lim && *ptr != tab)

1426 ++ptr;

1427 if (ptr < lim && (eword | echar))

1428 ++ptr;

1429 }

1430 else

1431 while (ptr < lim && eword--)

1432 {

1433 while (ptr < lim && blanks[to_uchar (*ptr)])

1434 ++ptr;

1435 while (ptr < lim && !blanks[to_uchar (*ptr)])

1436 ++ptr;

1437 }

1438

1439 #ifdef POSIX_UNSPECIFIED

1440 /* The following block of code makes GNU sort incompatible with

1441 standard Unix sort, so it's ifdef'd out for now.

1442 The POSIX spec isn't clear on how to interpret this.

1443 FIXME: request clarification.

1444

1445 From: kwzh@gnu.ai.mit.edu (Karl Heuer)

1446 Date: Thu, 30 May 96 12:20:41 -0400

1447 [Translated to POSIX 1003.1-2001 terminology by Paul Eggert.]

1448

1449 [...]I believe I've found another bug in `sort'.

1450

1451 $ cat /tmp/sort.in

1452 a b c 2 d

1453 pq rs 1 t

1454 $ textutils-1.15/src/sort -k1.7,1.7 </tmp/sort.in

1455 a b c 2 d

1456 pq rs 1 t

1457 $ /bin/sort -k1.7,1.7 </tmp/sort.in

1458 pq rs 1 t

1459 a b c 2 d

1460

1461 Unix sort produced the answer I expected: sort on the single character

1462 in column 7. GNU sort produced different results, because it disagrees

1463 on the interpretation of the key-end spec "M.N". Unix sort reads this

1464 as "skip M-1 fields, then N-1 characters"; but GNU sort wants it to mean

1465 "skip M-1 fields, then either N-1 characters or the rest of the current

1466 field, whichever comes first". This extra clause applies only to

1467 key-ends, not key-starts.

1468 */

1469

1470 /* Make LIM point to the end of (one byte past) the current field. */

1471 if (tab != TAB_DEFAULT)

1472 {

1473 char *newlim;

1474 newlim = memchr (ptr, tab, lim - ptr);

1475 if (newlim)

1476 lim = newlim;

1477 }

1478 else

1479 {

1480 char *newlim;

1481 newlim = ptr;

1482 while (newlim < lim && blanks[to_uchar (*newlim)])

1483 ++newlim;

1484 while (newlim < lim && !blanks[to_uchar (*newlim)])

1485 ++newlim;

1486 lim = newlim;

1487 }

1488 #endif

1489

1490 /* If we're ignoring leading blanks when computing the End

1491 of the field, don't start counting bytes until after skipping

1492 past any leading blanks. */

1493 if (key->skipeblanks)

1494 while (ptr < lim && blanks[to_uchar (*ptr)])

1495 ++ptr;

1496

1497 /* Advance PTR by ECHAR (if possible), but no further than LIM. */

1498 remaining_bytes = lim - ptr;

1499 if (echar < remaining_bytes)

1500 ptr += echar;

1501 else

1502 ptr = lim;

1503

1504 return ptr;

1505 }

1506

1507 /* Fill BUF reading from FP, moving buf->left bytes from the end

1508 of buf->buf to the beginning first. If EOF is reached and the

1509 file wasn't terminated by a newline, supply one. Set up BUF's line

1510 table too. FILE is the name of the file corresponding to FP.

1511 Return true if some input was read. */

1512

1513 static bool

1514 fillbuf (struct buffer *buf, FILE *fp, char const *file)

1515 {

1516 struct keyfield const *key = keylist;

1517 char eol = eolchar;

1518 size_t line_bytes = buf->line_bytes;

1519 size_t mergesize = merge_buffer_size - MIN_MERGE_BUFFER_SIZE;

1520

1521 if (buf->eof)

1522 return false;

1523

1524 if (buf->used != buf->left)

1525 {

1526 memmove (buf->buf, buf->buf + buf->used - buf->left, buf->left);

1527 buf->used = buf->left;

1528 buf->nlines = 0;

1529 }

1530

1531 for (;;)

1532 {

1533 char *ptr = buf->buf + buf->used;

1534 struct line *linelim = buffer_linelim (buf);

1535 struct line *line = linelim - buf->nlines;

1536 size_t avail = (char *) linelim - buf->nlines * line_bytes - ptr;

1537 char *line_start = buf->nlines ? line->text + line->length : buf->buf;

1538

1539 while (line_bytes + 1 < avail)

1540 {

1541 /* Read as many bytes as possible, but do not read so many

1542 bytes that there might not be enough room for the

1543 corresponding line array. The worst case is when the

1544 rest of the input file consists entirely of newlines,

1545 except that the last byte is not a newline. */

1546 size_t readsize = (avail - 1) / (line_bytes + 1);

1547 size_t bytes_read = fread (ptr, 1, readsize, fp);

1548 char *ptrlim = ptr + bytes_read;

1549 char *p;

1550 avail -= bytes_read;

1551

1552 if (bytes_read != readsize)

1553 {

1554 if (ferror (fp))

1555 die (_("read failed"), file);

1556 if (feof (fp))

1557 {

1558 buf->eof = true;

1559 if (buf->buf == ptrlim)

1560 return false;

1561 if (ptrlim[-1] != eol)

1562 *ptrlim++ = eol;

1563 }

1564 }

1565

1566 /* Find and record each line in the just-read input. */

1567 while ((p = memchr (ptr, eol, ptrlim - ptr)))

1568 {

1569 ptr = p + 1;

1570 line--;

1571 line->text = line_start;

1572 line->length = ptr - line_start;

1573 mergesize = MAX (mergesize, line->length);

1574 avail -= line_bytes;

1575

1576 if (key)

1577 {

1578 /* Precompute the position of the first key for

1579 efficiency. */

1580 line->keylim = (key->eword == SIZE_MAX

1581 ? p

1582 : limfield (line, key));

1583

1584 if (key->sword != SIZE_MAX)

1585 line->keybeg = begfield (line, key);

1586 else

1587 {

1588 if (key->skipsblanks)

1589 while (blanks[to_uchar (*line_start)])

1590 line_start++;

1591 line->keybeg = line_start;

1592 }

1593 }

1594

1595 line_start = ptr;

1596 }

1597

1598 ptr = ptrlim;

1599 if (buf->eof)

1600 break;

1601 }

1602

1603 buf->used = ptr - buf->buf;

1604 buf->nlines = buffer_linelim (buf) - line;

1605 if (buf->nlines != 0)

1606 {

1607 buf->left = ptr - line_start;

1608 merge_buffer_size = mergesize + MIN_MERGE_BUFFER_SIZE;

1609 return true;

1610 }

1611

1612 {

1613 /* The current input line is too long to fit in the buffer.

1614 Double the buffer size and try again, keeping it properly

1615 aligned. */

1616 size_t line_alloc = buf->alloc / sizeof (struct line);

1617 buf->buf = x2nrealloc (buf->buf, &line_alloc, sizeof (struct line));

1618 buf->alloc = line_alloc * sizeof (struct line);

1619 }

1620 }

1621 }

1622

1623 /* Compare strings A and B as numbers without explicitly converting them to

1624 machine numbers. Comparatively slow for short strings, but asymptotically

1625 hideously fast. */

1626

1627 static int

1628 numcompare (const char *a, const char *b)

1629 {

1630 while (blanks[to_uchar (*a)])

1631 a++;

1632 while (blanks[to_uchar (*b)])

1633 b++;

1634

1635 return strnumcmp (a, b, decimal_point, thousands_sep);

1636 }

1637

1638 static int

1639 general_numcompare (const char *sa, const char *sb)

1640 {

1641 /* FIXME: add option to warn about failed conversions. */

1642 /* FIXME: maybe add option to try expensive FP conversion

1643 only if A and B can't be compared more cheaply/accurately. */

1644

1645 char *ea;

1646 char *eb;

1647 double a = strtod (sa, &ea);

1648 double b = strtod (sb, &eb);

1649

1650 /* Put conversion errors at the start of the collating sequence. */

1651 if (sa == ea)

1652 return sb == eb ? 0 : -1;

1653 if (sb == eb)

1654 return 1;

1655

1656 /* Sort numbers in the usual way, where -0 == +0. Put NaNs after

1657 conversion errors but before numbers; sort them by internal

1658 bit-pattern, for lack of a more portable alternative. */

1659 return (a < b ? -1

1660 : a > b ? 1

1661 : a == b ? 0

1662 : b == b ? -1

1663 : a == a ? 1

1664 : memcmp ((char *) &a, (char *) &b, sizeof a));

1665 }

1666

1667 /* Return an integer in 1..12 of the month name MONTH with length LEN.

1668 Return 0 if the name in S is not recognized. */

1669

1670 static int

1671 getmonth (char const *month, size_t len)

1672 {

1673 size_t lo = 0;

1674 size_t hi = MONTHS_PER_YEAR;

1675 char const *monthlim = month + len;

1676

1677 for (;;)

1678 {

1679 if (month == monthlim)

1680 return 0;

1681 if (!blanks[to_uchar (*month)])

1682 break;

1683 ++month;

1684 }

1685

1686 do

1687 {

1688 size_t ix = (lo + hi) / 2;

1689 char const *m = month;

1690 char const *n = monthtab[ix].name;

1691

1692 for (;; m++, n++)

1693 {

1694 if (!*n)

1695 return monthtab[ix].val;

1696 if (m == monthlim || fold_toupper[to_uchar (*m)] < to_uchar (*n))

1697 {

1698 hi = ix;

1699 break;

1700 }

1701 else if (fold_toupper[to_uchar (*m)] > to_uchar (*n))

1702 {

1703 lo = ix + 1;

1704 break;

1705 }

1706 }

1707 }

1708 while (lo < hi);

1709

1710 return 0;

1711 }

1712

1713 /* A source of random data. */

1714 static struct randread_source *randread_source;

1715

1716 /* Return the Ith randomly-generated state. The caller must invoke

1717 random_state (H) for all H less than I before invoking random_state

1718 (I). */

1719

1720 static struct md5_ctx

1721 random_state (size_t i)

1722 {

1723 /* An array of states resulting from the random data, and counts of

1724 its used and allocated members. */

1725 static struct md5_ctx *state;

1726 static size_t used;

1727 static size_t allocated;

1728

1729 struct md5_ctx *s = &state[i];

1730

1731 if (used <= i)

1732 {

1733 unsigned char buf[MD5_DIGEST_SIZE];

1734

1735 used++;

1736

1737 if (allocated <= i)

1738 {

1739 state = X2NREALLOC (state, &allocated);

1740 s = &state[i];

1741 }

1742

1743 randread (randread_source, buf, sizeof buf);

1744 md5_init_ctx (s);

1745 md5_process_bytes (buf, sizeof buf, s);

1746 }

1747

1748 return *s;

1749 }

1750

1751 /* Compare the hashes of TEXTA with length LENGTHA to those of TEXTB

1752 with length LENGTHB. Return negative if less, zero if equal,

1753 positive if greater. */

1754

1755 static int

1756 cmp_hashes (char const *texta, size_t lena,

1757 char const *textb, size_t lenb)

1758 {

1759 /* Try random hashes until a pair of hashes disagree. But if the

1760 first pair of random hashes agree, check whether the keys are

1761 identical and if so report no difference. */

1762 int diff;

1763 size_t i;

1764 for (i = 0; ; i++)

1765 {

1766 uint32_t dig[2][MD5_DIGEST_SIZE / sizeof (uint32_t)];

1767 struct md5_ctx s[2];

1768 s[0] = s[1] = random_state (i);

1769 md5_process_bytes (texta, lena, &s[0]); md5_finish_ctx (&s[0], dig[0]);

1770 md5_process_bytes (textb, lenb, &s[1]); md5_finish_ctx (&s[1], dig[1]);

1771 diff = memcmp (dig[0], dig[1], sizeof dig[0]);

1772 if (diff != 0)

1773 break;

1774 if (i == 0 && lena == lenb && memcmp (texta, textb, lena) == 0)

1775 break;

1776 }

1777

1778 return diff;

1779 }

1780

1781 /* Compare the keys TEXTA (of length LENA) and TEXTB (of length LENB)

1782 using one or more random hash functions. */

1783

1784 static int

1785 compare_random (char *restrict texta, size_t lena,

1786 char *restrict textb, size_t lenb)

1787 {

1788 int diff;

1789

1790 if (! hard_LC_COLLATE)

1791 diff = cmp_hashes (texta, lena, textb, lenb);

1792 else

1793 {

1794 /* Transform the text into the basis of comparison, so that byte

1795 strings that would otherwise considered to be equal are

1796 considered equal here even if their bytes differ. */

1797

1798 char *buf = NULL;

1799 char stackbuf[4000];

1800 size_t tlena = xmemxfrm (stackbuf, sizeof stackbuf, texta, lena);

1801 bool a_fits = tlena <= sizeof stackbuf;

1802 size_t tlenb = xmemxfrm ((a_fits ? stackbuf + tlena : NULL),

1803 (a_fits ? sizeof stackbuf - tlena : 0),

1804 textb, lenb);

1805

1806 if (a_fits && tlena + tlenb <= sizeof stackbuf)

1807 buf = stackbuf;

1808 else

1809 {

1810 /* Adding 1 to the buffer size lets xmemxfrm run a bit

1811 faster by avoiding the need for an extra buffer copy. */

1812 buf = xmalloc (tlena + tlenb + 1);

1813 xmemxfrm (buf, tlena + 1, texta, lena);

1814 xmemxfrm (buf + tlena, tlenb + 1, textb, lenb);

1815 }

1816

1817 diff = cmp_hashes (buf, tlena, buf + tlena, tlenb);

1818

1819 if (buf != stackbuf)

1820 free (buf);

1821 }

1822

1823 return diff;

1824 }

1825

1826 /* Compare the keys TEXTA (of length LENA) and TEXTB (of length LENB)

1827 using filevercmp. See lib/filevercmp.h for function description. */

1828

1829 static int

1830 compare_version (char *restrict texta, size_t lena,

1831 char *restrict textb, size_t lenb)

1832 {

1833 int diff;

1834

1835 /* It is necessary to save the character after the end of the field.

1836 "filevercmp" works with NUL terminated strings. Our blocks of

1837 text are not necessarily terminated with a NUL byte. */

1838 char sv_a = texta[lena];

1839 char sv_b = textb[lenb];

1840

1841 texta[lena] = '\0';

1842 textb[lenb] = '\0';

1843

1844 diff = filevercmp (texta, textb);

1845

1846 texta[lena] = sv_a;

1847 textb[lenb] = sv_b;

1848

1849 return diff;

1850 }

1851

1852 /* Compare two lines A and B trying every key in sequence until there

1853 are no more keys or a difference is found. */

1854

1855 static int

1856 keycompare (const struct line *a, const struct line *b)

1857 {

1858 struct keyfield const *key = keylist;

1859

1860 /* For the first iteration only, the key positions have been

1861 precomputed for us. */

1862 char *texta = a->keybeg;

1863 char *textb = b->keybeg;

1864 char *lima = a->keylim;

1865 char *limb = b->keylim;

1866

1867 int diff;

1868

1869 for (;;)

1870 {

1871 char const *translate = key->translate;

1872 bool const *ignore = key->ignore;

1873

1874 /* Find the lengths. */

1875 size_t lena = lima <= texta ? 0 : lima - texta;

1876 size_t lenb = limb <= textb ? 0 : limb - textb;

1877

1878 /* Actually compare the fields. */

1879

1880 if (key->random)

1881 diff = compare_random (texta, lena, textb, lenb);

1882 else if (key->numeric | key->general_numeric)

1883 {

1884 char savea = *lima, saveb = *limb;

1885

1886 *lima = *limb = '\0';

1887 diff = ((key->numeric ? numcompare : general_numcompare)

1888 (texta, textb));

1889 *lima = savea, *limb = saveb;

1890 }

1891 else if (key->version)

1892 diff = compare_version (texta, lena, textb, lenb);

1893 else if (key->month)

1894 diff = getmonth (texta, lena) - getmonth (textb, lenb);

1895 /* Sorting like this may become slow, so in a simple locale the user

1896 can select a faster sort that is similar to ascii sort. */

1897 else if (hard_LC_COLLATE)

1898 {

1899 if (ignore || translate)

1900 {

1901 char buf[4000];

1902 size_t size = lena + 1 + lenb + 1;

1903 char *copy_a = (size <= sizeof buf ? buf : xmalloc (size));

1904 char *copy_b = copy_a + lena + 1;

1905 size_t new_len_a, new_len_b, i;

1906

1907 /* Ignore and/or translate chars before comparing. */

1908 for (new_len_a = new_len_b = i = 0; i < MAX (lena, lenb); i++)

1909 {

1910 if (i < lena)

1911 {

1912 copy_a[new_len_a] = (translate

1913 ? translate[to_uchar (texta[i])]

1914 : texta[i]);

1915 if (!ignore || !ignore[to_uchar (texta[i])])

1916 ++new_len_a;

1917 }

1918 if (i < lenb)

1919 {

1920 copy_b[new_len_b] = (translate

1921 ? translate[to_uchar (textb[i])]

1922 : textb [i]);

1923 if (!ignore || !ignore[to_uchar (textb[i])])

1924 ++new_len_b;

1925 }

1926 }

1927

1928 diff = xmemcoll (copy_a, new_len_a, copy_b, new_len_b);

1929

1930 if (sizeof buf < size)

1931 free (copy_a);

1932 }

1933 else if (lena == 0)

1934 diff = - NONZERO (lenb);

1935 else if (lenb == 0)

1936 goto greater;

1937 else

1938 diff = xmemcoll (texta, lena, textb, lenb);

1939 }

1940 else if (ignore)

1941 {

1942 #define CMP_WITH_IGNORE(A, B) \

1943 do \

1944 { \

1945 for (;;) \

1946 { \

1947 while (texta < lima && ignore[to_uchar (*texta)]) \

1948 ++texta; \

1949 while (textb < limb && ignore[to_uchar (*textb)]) \

1950 ++textb; \

1951 if (! (texta < lima && textb < limb)) \

1952 break; \

1953 diff = to_uchar (A) - to_uchar (B); \

1954 if (diff) \

1955 goto not_equal; \

1956 ++texta; \

1957 ++textb; \

1958 } \

1959 \

1960 diff = (texta < lima) - (textb < limb); \

1961 } \

1962 while (0)

1963

1964 if (translate)

1965 CMP_WITH_IGNORE (translate[to_uchar (*texta)],

1966 translate[to_uchar (*textb)]);

1967 else

1968 CMP_WITH_IGNORE (*texta, *textb);

1969 }

1970 else if (lena == 0)

1971 diff = - NONZERO (lenb);

1972 else if (lenb == 0)

1973 goto greater;

1974 else

1975 {

1976 if (translate)

1977 {

1978 while (texta < lima && textb < limb)

1979 {

1980 diff = (to_uchar (translate[to_uchar (*texta++)])

1981 - to_uchar (translate[to_uchar (*textb++)]));

1982 if (diff)

1983 goto not_equal;

1984 }

1985 }

1986 else

1987 {

1988 diff = memcmp (texta, textb, MIN (lena, lenb));

1989 if (diff)

1990 goto not_equal;

1991 }

1992 diff = lena < lenb ? -1 : lena != lenb;

1993 }

1994

1995 if (diff)

1996 goto not_equal;

1997

1998 key = key->next;

1999 if (! key)

2000 break;

2001

2002 /* Find the beginning and limit of the next field. */

2003 if (key->eword != SIZE_MAX)

2004 lima = limfield (a, key), limb = limfield (b, key);

2005 else

2006 lima = a->text + a->length - 1, limb = b->text + b->length - 1;

2007

2008 if (key->sword != SIZE_MAX)

2009 texta = begfield (a, key), textb = begfield (b, key);

2010 else

2011 {

2012 texta = a->text, textb = b->text;

2013 if (key->skipsblanks)

2014 {

2015 while (texta < lima && blanks[to_uchar (*texta)])

2016 ++texta;

2017 while (textb < limb && blanks[to_uchar (*textb)])

2018 ++textb;

2019 }

2020 }

2021 }

2022

2023 return 0;

2024

2025 greater:

2026 diff = 1;

2027 not_equal:

2028 return key->reverse ? -diff : diff;

2029 }

2030

2031 /* Compare two lines A and B, returning negative, zero, or positive

2032 depending on whether A compares less than, equal to, or greater than B. */

2033

2034 static int

2035 compare (const struct line *a, const struct line *b)

2036 {

2037 int diff;

2038 size_t alen, blen;

2039

2040 /* First try to compare on the specified keys (if any).

2041 The only two cases with no key at all are unadorned sort,

2042 and unadorned sort -r. */

2043 if (keylist)

2044 {

2045 diff = keycompare (a, b);

2046 if (diff | unique | stable)

2047 return diff;

2048 }

2049

2050 /* If the keys all compare equal (or no keys were specified)

2051 fall through to the default comparison. */

2052 alen = a->length - 1, blen = b->length - 1;

2053

2054 if (alen == 0)

2055 diff = - NONZERO (blen);

2056 else if (blen == 0)

2057 diff = 1;

2058 else if (hard_LC_COLLATE)

2059 diff = xmemcoll (a->text, alen, b->text, blen);

2060 else if (! (diff = memcmp (a->text, b->text, MIN (alen, blen))))

2061 diff = alen < blen ? -1 : alen != blen;

2062

2063 return reverse ? -diff : diff;

2064 }

2065

2066 /* Check that the lines read from FILE_NAME come in order. Return

2067 true if they are in order. If CHECKONLY == 'c', also print a

2068 diagnostic (FILE_NAME, line number, contents of line) to stderr if

2069 they are not in order. */

2070

2071 static bool

2072 check (char const *file_name, char checkonly)

2073 {

2074 FILE *fp = xfopen (file_name, "r");

2075 struct buffer buf; /* Input buffer. */

2076 struct line temp; /* Copy of previous line. */

2077 size_t alloc = 0;

2078 uintmax_t line_number = 0;

2079 struct keyfield const *key = keylist;

2080 bool nonunique = ! unique;

2081 bool ordered = true;

2082

2083 initbuf (&buf, sizeof (struct line),

2084 MAX (merge_buffer_size, sort_size));

2085 temp.text = NULL;

2086

2087 while (fillbuf (&buf, fp, file_name))

2088 {

2089 struct line const *line = buffer_linelim (&buf);

2090 struct line const *linebase = line - buf.nlines;

2091

2092 /* Make sure the line saved from the old buffer contents is

2093 less than or equal to the first line of the new buffer. */

2094 if (alloc && nonunique <= compare (&temp, line - 1))

2095 {

2096 found_disorder:

2097 {

2098 if (checkonly == 'c')

2099 {

2100 struct line const *disorder_line = line - 1;

2101 uintmax_t disorder_line_number =

2102 buffer_linelim (&buf) - disorder_line + line_number;

2103 char hr_buf[INT_BUFSIZE_BOUND (uintmax_t)];

2104 fprintf (stderr, _("%s: %s:%s: disorder: "),

2105 program_name, file_name,

2106 umaxtostr (disorder_line_number, hr_buf));

2107 write_bytes (disorder_line->text, disorder_line->length,

2108 stderr, _("standard error"));

2109 }

2110

2111 ordered = false;

2112 break;

2113 }

2114 }

2115

2116 /* Compare each line in the buffer with its successor. */

2117 while (linebase < --line)

2118 if (nonunique <= compare (line, line - 1))

2119 goto found_disorder;

2120

2121 line_number += buf.nlines;

2122

2123 /* Save the last line of the buffer. */

2124 if (alloc < line->length)

2125 {

2126 do

2127 {

2128 alloc *= 2;

2129 if (! alloc)

2130 {

2131 alloc = line->length;

2132 break;

2133 }

2134 }

2135 while (alloc < line->length);

2136

2137 temp.text = xrealloc (temp.text, alloc);

2138 }

2139 memcpy (temp.text, line->text, line->length);

2140 temp.length = line->length;

2141 if (key)

2142 {

2143 temp.keybeg = temp.text + (line->keybeg - line->text);

2144 temp.keylim = temp.text + (line->keylim - line->text);

2145 }

2146 }

2147

2148 xfclose (fp, file_name);

2149 free (buf.buf);

2150 free (temp.text);

2151 return ordered;

2152 }

2153

2154 /* Merge lines from FILES onto OFP. NTEMPS is the number of temporary

2155 files (all of which are at the start of the FILES array), and

2156 NFILES is the number of files; 0 <= NTEMPS <= NFILES <= NMERGE.

2157 Close input and output files before returning.

2158 OUTPUT_FILE gives the name of the output file. If it is NULL,

2159 the output file is standard output. If OFP is NULL, the output

2160 file has not been opened yet (or written to, if standard output). */

2161

2162 static void

2163 mergefps (struct sortfile *files, size_t ntemps, size_t nfiles,

2164 FILE *ofp, char const *output_file)

2165 {

2166 FILE **fps = xnmalloc (nmerge, sizeof *fps);

2167 /* Input streams for each file. */

2168 struct buffer *buffer = xnmalloc (nmerge, sizeof *buffer);

2169 /* Input buffers for each file. */

2170 struct line saved; /* Saved line storage for unique check. */

2171 struct line const *savedline = NULL;

2172 /* &saved if there is a saved line. */

2173 size_t savealloc = 0; /* Size allocated for the saved line. */

2174 struct line const **cur = xnmalloc (nmerge, sizeof *cur);

2175 /* Current line in each line table. */

2176 struct line const **base = xnmalloc (nmerge, sizeof *base);

2177 /* Base of each line table. */

2178 size_t *ord = xnmalloc (nmerge, sizeof *ord);

2179 /* Table representing a permutation of fps,

2180 such that cur[ord[0]] is the smallest line

2181 and will be next output. */

2182 size_t i;

2183 size_t j;

2184 size_t t;

2185 struct keyfield const *key = keylist;

2186 saved.text = NULL;

2187

2188 /* Read initial lines from each input file. */

2189 for (i = 0; i < nfiles; )

2190 {

2191 fps[i] = (files[i].pid

2192 ? open_temp (files[i].name, files[i].pid)

2193 : xfopen (files[i].name, "r"));

2194 initbuf (&buffer[i], sizeof (struct line),

2195 MAX (merge_buffer_size, sort_size / nfiles));

2196 if (fillbuf (&buffer[i], fps[i], files[i].name))

2197 {

2198 struct line const *linelim = buffer_linelim (&buffer[i]);

2199 cur[i] = linelim - 1;

2200 base[i] = linelim - buffer[i].nlines;

2201 i++;

2202 }

2203 else

2204 {

2205 /* fps[i] is empty; eliminate it from future consideration. */

2206 xfclose (fps[i], files[i].name);

2207 if (i < ntemps)

2208 {

2209 ntemps--;

2210 zaptemp (files[i].name);

2211 }

2212 free (buffer[i].buf);

2213 --nfiles;

2214 for (j = i; j < nfiles; ++j)

2215 files[j] = files[j + 1];

2216 }

2217 }

2218

2219 if (! ofp)

2220 ofp = xfopen (output_file, "w");

2221

2222 /* Set up the ord table according to comparisons among input lines.

2223 Since this only reorders two items if one is strictly greater than

2224 the other, it is stable. */

2225 for (i = 0; i < nfiles; ++i)

2226 ord[i] = i;

2227 for (i = 1; i < nfiles; ++i)

2228 if (0 < compare (cur[ord[i - 1]], cur[ord[i]]))

2229 t = ord[i - 1], ord[i - 1] = ord[i], ord[i] = t, i = 0;

2230

2231 /* Repeatedly output the smallest line until no input remains. */

2232 while (nfiles)

2233 {

2234 struct line const *smallest = cur[ord[0]];

2235

2236 /* If uniquified output is turned on, output only the first of

2237 an identical series of lines. */

2238 if (unique)

2239 {

2240 if (savedline && compare (savedline, smallest))

2241 {

2242 savedline = NULL;

2243 write_bytes (saved.text, saved.length, ofp, output_file);

2244 }

2245 if (!savedline)

2246 {

2247 savedline = &saved;

2248 if (savealloc < smallest->length)

2249 {

2250 do

2251 if (! savealloc)

2252 {

2253 savealloc = smallest->length;

2254 break;

2255 }

2256 while ((savealloc *= 2) < smallest->length);

2257

2258 saved.text = xrealloc (saved.text, savealloc);

2259 }

2260 saved.length = smallest->length;

2261 memcpy (saved.text, smallest->text, saved.length);

2262 if (key)

2263 {

2264 saved.keybeg =

2265 saved.text + (smallest->keybeg - smallest->text);

2266 saved.keylim =

2267 saved.text + (smallest->keylim - smallest->text);

2268 }

2269 }

2270 }

2271 else

2272 write_bytes (smallest->text, smallest->length, ofp, output_file);

2273

2274 /* Check if we need to read more lines into core. */

2275 if (base[ord[0]] < smallest)

2276 cur[ord[0]] = smallest - 1;

2277 else

2278 {

2279 if (fillbuf (&buffer[ord[0]], fps[ord[0]], files[ord[0]].name))

2280 {

2281 struct line const *linelim = buffer_linelim (&buffer[ord[0]]);

2282 cur[ord[0]] = linelim - 1;

2283 base[ord[0]] = linelim - buffer[ord[0]].nlines;

2284 }

2285 else

2286 {

2287 /* We reached EOF on fps[ord[0]]. */

2288 for (i = 1; i < nfiles; ++i)

2289 if (ord[i] > ord[0])

2290 --ord[i];

2291 --nfiles;

2292 xfclose (fps[ord[0]], files[ord[0]].name);

2293 if (ord[0] < ntemps)

2294 {

2295 ntemps--;

2296 zaptemp (files[ord[0]].name);

2297 }

2298 free (buffer[ord[0]].buf);

2299 for (i = ord[0]; i < nfiles; ++i)

2300 {

2301 fps[i] = fps[i + 1];

2302 files[i] = files[i + 1];

2303 buffer[i] = buffer[i + 1];

2304 cur[i] = cur[i + 1];

2305 base[i] = base[i + 1];

2306 }

2307 for (i = 0; i < nfiles; ++i)

2308 ord[i] = ord[i + 1];

2309 continue;

2310 }

2311 }

2312

2313 /* The new line just read in may be larger than other lines

2314 already in main memory; push it back in the queue until we

2315 encounter a line larger than it. Optimize for the common

2316 case where the new line is smallest. */

2317 {

2318 size_t lo = 1;

2319 size_t hi = nfiles;

2320 size_t probe = lo;

2321 size_t ord0 = ord[0];

2322 size_t count_of_smaller_lines;

2323

2324 while (lo < hi)

2325 {

2326 int cmp = compare (cur[ord0], cur[ord[probe]]);

2327 if (cmp < 0 || (cmp == 0 && ord0 < ord[probe]))

2328 hi = probe;

2329 else

2330 lo = probe + 1;

2331 probe = (lo + hi) / 2;

2332 }

2333

2334 count_of_smaller_lines = lo - 1;

2335 for (j = 0; j < count_of_smaller_lines; j++)

2336 ord[j] = ord[j + 1];

2337 ord[count_of_smaller_lines] = ord0;

2338 }

2339

2340 /* Free up some resources every once in a while. */

2341 if (MAX_PROCS_BEFORE_REAP < nprocs)

2342 reap_some ();

2343 }

2344

2345 if (unique && savedline)

2346 {

2347 write_bytes (saved.text, saved.length, ofp, output_file);

2348 free (saved.text);

2349 }

2350

2351 xfclose (ofp, output_file);

2352 free(fps);

2353 free(buffer);

2354 free(ord);

2355 free(base);

2356 free(cur);

2357 }

2358

2359 /* Merge into T the two sorted arrays of lines LO (with NLO members)

2360 and HI (with NHI members). T, LO, and HI point just past their

2361 respective arrays, and the arrays are in reverse order. NLO and

2362 NHI must be positive, and HI - NHI must equal T - (NLO + NHI). */

2363

2364 static inline void

2365 mergelines (struct line *t,

2366 struct line const *lo, size_t nlo,

2367 struct line const *hi, size_t nhi)

2368 {

2369 for (;;)

2370 if (compare (lo - 1, hi - 1) <= 0)

2371 {

2372 *--t = *--lo;

2373 if (! --nlo)

2374 {

2375 /* HI - NHI equalled T - (NLO + NHI) when this function

2376 began. Therefore HI must equal T now, and there is no

2377 need to copy from HI to T. */

2378 return;

2379 }

2380 }

2381 else

2382 {

2383 *--t = *--hi;

2384 if (! --nhi)

2385 {

2386 do

2387 *--t = *--lo;

2388 while (--nlo);

2389

2390 return;

2391 }

2392 }

2393 }

2394

2395 /* Sort the array LINES with NLINES members, using TEMP for temporary space.

2396 NLINES must be at least 2.

2397 The input and output arrays are in reverse order, and LINES and

2398 TEMP point just past the end of their respective arrays.

2399

2400 Use a recursive divide-and-conquer algorithm, in the style

2401 suggested by Knuth volume 3 (2nd edition), exercise 5.2.4-23. Use

2402 the optimization suggested by exercise 5.2.4-10; this requires room

2403 for only 1.5*N lines, rather than the usual 2*N lines. Knuth

2404 writes that this memory optimization was originally published by

2405 D. A. Bell, Comp J. 1 (1958), 75. */

2406

2407 static void

2408 sortlines (struct line *lines, size_t nlines, struct line *temp)

2409 {

2410 if (nlines == 2)

2411 {

2412 if (0 < compare (&lines[-1], &lines[-2]))

2413 {

2414 struct line tmp = lines[-1];

2415 lines[-1] = lines[-2];

2416 lines[-2] = tmp;

2417 }

2418 }

2419 else

2420 {

2421 size_t nlo = nlines / 2;

2422 size_t nhi = nlines - nlo;

2423 struct line *lo = lines;

2424 struct line *hi = lines - nlo;

2425 struct line *sorted_lo = temp;

2426

2427 sortlines (hi, nhi, temp);

2428 if (1 < nlo)

2429 sortlines_temp (lo, nlo, sorted_lo);

2430 else

2431 sorted_lo[-1] = lo[-1];

2432

2433 mergelines (lines, sorted_lo, nlo, hi, nhi);

2434 }

2435 }

2436

2437 /* Like sortlines (LINES, NLINES, TEMP), except output into TEMP

2438 rather than sorting in place. */

2439

2440 static void

2441 sortlines_temp (struct line *lines, size_t nlines, struct line *temp)

2442 {

2443 if (nlines == 2)

2444 {

2445 /* Declare `swap' as int, not bool, to work around a bug

2446 <http://lists.gnu.org/archive/html/bug-coreutils/2005-10/msg00086.html>

2447 in the IBM xlc 6.0.0.0 compiler in 64-bit mode. */

2448 int swap = (0 < compare (&lines[-1], &lines[-2]));

2449 temp[-1] = lines[-1 - swap];

2450 temp[-2] = lines[-2 + swap];

2451 }

2452 else

2453 {

2454 size_t nlo = nlines / 2;

2455 size_t nhi = nlines - nlo;

2456 struct line *lo = lines;

2457 struct line *hi = lines - nlo;

2458 struct line *sorted_hi = temp - nlo;

2459

2460 sortlines_temp (hi, nhi, sorted_hi);

2461 if (1 < nlo)

2462 sortlines (lo, nlo, temp);

2463

2464 mergelines (temp, lo, nlo, sorted_hi, nhi);

2465 }

2466 }

2467

2468 /* Scan through FILES[NTEMPS .. NFILES-1] looking for a file that is

2469 the same as OUTFILE. If found, merge the found instances (and perhaps

2470 some other files) into a temporary file so that it can in turn be

2471 merged into OUTFILE without destroying OUTFILE before it is completely

2472 read. Return the new value of NFILES, which differs from the old if

2473 some merging occurred.

2474

2475 This test ensures that an otherwise-erroneous use like

2476 "sort -m -o FILE ... FILE ..." copies FILE before writing to it.

2477 It's not clear that POSIX requires this nicety.

2478 Detect common error cases, but don't try to catch obscure cases like

2479 "cat ... FILE ... | sort -m -o FILE"

2480 where traditional "sort" doesn't copy the input and where

2481 people should know that they're getting into trouble anyway.

2482 Catching these obscure cases would slow down performance in

2483 common cases. */

2484

2485 static size_t

2486 avoid_trashing_input (struct sortfile *files, size_t ntemps,

2487 size_t nfiles, char const *outfile)

2488 {

2489 size_t i;

2490 bool got_outstat = false;

2491 struct stat outstat;

2492

2493 for (i = ntemps; i < nfiles; i++)

2494 {

2495 bool is_stdin = STREQ (files[i].name, "-");

2496 bool same;

2497 struct stat instat;

2498

2499 if (outfile && STREQ (outfile, files[i].name) && !is_stdin)

2500 same = true;

2501 else

2502 {

2503 if (! got_outstat)

2504 {

2505 if ((outfile

2506 ? stat (outfile, &outstat)

2507 : fstat (STDOUT_FILENO, &outstat))

2508 != 0)

2509 break;

2510 got_outstat = true;

2511 }

2512

2513 same = (((is_stdin

2514 ? fstat (STDIN_FILENO, &instat)

2515 : stat (files[i].name, &instat))

2516 == 0)

2517 && SAME_INODE (instat, outstat));

2518 }

2519

2520 if (same)

2521 {

2522 FILE *tftp;

2523 pid_t pid;

2524 char *temp = create_temp (&tftp, &pid);

2525 mergefps (&files[i],0, nfiles - i, tftp, temp);

2526 files[i].name = temp;

2527 files[i].pid = pid;

2528 return i + 1;

2529 }

2530 }

2531

2532 return nfiles;

2533 }

2534

2535 /* Merge the input FILES. NTEMPS is the number of files at the

2536 start of FILES that are temporary; it is zero at the top level.

2537 NFILES is the total number of files. Put the output in

2538 OUTPUT_FILE; a null OUTPUT_FILE stands for standard output. */

2539

2540 static void

2541 merge (struct sortfile *files, size_t ntemps, size_t nfiles,

2542 char const *output_file)

2543 {

2544 while (nmerge < nfiles)

2545 {

2546 /* Number of input files processed so far. */

2547 size_t in;

2548

2549 /* Number of output files generated so far. */

2550 size_t out;

2551

2552 /* nfiles % NMERGE; this counts input files that are left over

2553 after all full-sized merges have been done. */

2554 size_t remainder;

2555

2556 /* Number of easily-available slots at the next loop iteration. */

2557 size_t cheap_slots;

2558

2559 /* Do as many NMERGE-size merges as possible. */

2560 for (out = in = 0; out < nfiles / nmerge; out++, in += nmerge)

2561 {

2562 FILE *tfp;

2563 pid_t pid;

2564 char *temp = create_temp (&tfp, &pid);

2565 size_t nt = MIN (ntemps, nmerge);

2566 ntemps -= nt;

2567 mergefps (&files[in], nt, nmerge, tfp, temp);

2568 files[out].name = temp;

2569 files[out].pid = pid;

2570 }

2571

2572 remainder = nfiles - in;

2573 cheap_slots = nmerge - out % nmerge;

2574

2575 if (cheap_slots < remainder)

2576 {

2577 /* So many files remain that they can't all be put into the last

2578 NMERGE-sized output window. Do one more merge. Merge as few

2579 files as possible, to avoid needless I/O. */

2580 size_t nshortmerge = remainder - cheap_slots + 1;

2581 FILE *tfp;

2582 pid_t pid;

2583 char *temp = create_temp (&tfp, &pid);

2584 size_t nt = MIN (ntemps, nshortmerge);

2585 ntemps -= nt;

2586 mergefps (&files[in], nt, nshortmerge, tfp, temp);

2587 files[out].name = temp;

2588 files[out++].pid = pid;

2589 in += nshortmerge;

2590 }

2591

2592 /* Put the remaining input files into the last NMERGE-sized output

2593 window, so they will be merged in the next pass. */

2594 memmove(&files[out], &files[in], (nfiles - in) * sizeof *files);

2595 ntemps += out;

2596 nfiles -= in - out;

2597 }

2598

2599 nfiles = avoid_trashing_input (files, ntemps, nfiles, output_file);

2600 mergefps (files, ntemps, nfiles, NULL, output_file);

2601 }

2602

2603 /* Sort NFILES FILES onto OUTPUT_FILE. */

2604

2605 static void

2606 sort (char * const *files, size_t nfiles, char const *output_file)

2607 {

2608 struct buffer buf;

2609 size_t ntemps = 0;

2610 bool output_file_created = false;

2611

2612 buf.alloc = 0;

2613

2614 while (nfiles)

2615 {

2616 char const *temp_output;

2617 char const *file = *files;

2618 FILE *fp = xfopen (file, "r");

2619 FILE *tfp;

2620 size_t bytes_per_line = (2 * sizeof (struct line)

2621 - sizeof (struct line) / 2);

2622

2623 if (! buf.alloc)

2624 initbuf (&buf, bytes_per_line,

2625 sort_buffer_size (&fp, 1, files, nfiles, bytes_per_line));

2626 buf.eof = false;

2627 files++;

2628 nfiles--;

2629

2630 while (fillbuf (&buf, fp, file))

2631 {

2632 struct line *line;

2633 struct line *linebase;

2634

2635 if (buf.eof && nfiles

2636 && (bytes_per_line + 1

2637 < (buf.alloc - buf.used - bytes_per_line * buf.nlines)))

2638 {

2639 /* End of file, but there is more input and buffer room.

2640 Concatenate the next input file; this is faster in

2641 the usual case. */

2642 buf.left = buf.used;

2643 break;

2644 }

2645

2646 line = buffer_linelim (&buf);

2647 linebase = line - buf.nlines;

2648 if (1 < buf.nlines)

2649 sortlines (line, buf.nlines, linebase);

2650 if (buf.eof && !nfiles && !ntemps && !buf.left)

2651 {

2652 xfclose (fp, file);

2653 tfp = xfopen (output_file, "w");

2654 temp_output = output_file;

2655 output_file_created = true;

2656 }

2657 else

2658 {

2659 ++ntemps;

2660 temp_output = create_temp (&tfp, NULL);

2661 }

2662

2663 do

2664 {

2665 line--;

2666 write_bytes (line->text, line->length, tfp, temp_output);

2667 if (unique)

2668 while (linebase < line && compare (line, line - 1) == 0)

2669 line--;

2670 }

2671 while (linebase < line);

2672

2673 xfclose (tfp, temp_output);

2674

2675 /* Free up some resources every once in a while. */

2676 if (MAX_PROCS_BEFORE_REAP < nprocs)

2677 reap_some ();

2678

2679 if (output_file_created)

2680 goto finish;

2681 }

2682 xfclose (fp, file);

2683 }

2684

2685 finish:

2686 free (buf.buf);

2687

2688 if (! output_file_created)

2689 {

2690 size_t i;

2691 struct tempnode *node = temphead;

2692 struct sortfile *tempfiles = xnmalloc (ntemps, sizeof *tempfiles);

2693 for (i = 0; node; i++)

2694 {

2695 tempfiles[i].name = node->name;

2696 tempfiles[i].pid = node->pid;

2697 node = node->next;

2698 }

2699 merge (tempfiles, ntemps, ntemps, output_file);

2700 free (tempfiles);

2701 }

2702 }

2703

2704 /* Insert a malloc'd copy of key KEY_ARG at the end of the key list. */

2705

2706 static void

2707 insertkey (struct keyfield *key_arg)

2708 {

2709 struct keyfield **p;

2710 struct keyfield *key = xmemdup (key_arg, sizeof *key);

2711

2712 for (p = &keylist; *p; p = &(*p)->next)

2713 continue;

2714 *p = key;

2715 key->next = NULL;

2716 }

2717

2718 /* Report a bad field specification SPEC, with extra info MSGID. */

2719

2720 static void badfieldspec (char const *, char const *)

2721 ATTRIBUTE_NORETURN;

2722 static void

2723 badfieldspec (char const *spec, char const *msgid)

2724 {

2725 error (SORT_FAILURE, 0, _("%s: invalid field specification %s"),

2726 _(msgid), quote (spec));

2727 abort ();

2728 }

2729

2730 /* Report incompatible options. */

2731

2732 static void incompatible_options (char const *) ATTRIBUTE_NORETURN;

2733 static void

2734 incompatible_options (char const *opts)

2735 {

2736 error (SORT_FAILURE, 0, _("options `-%s' are incompatible"), opts);

2737 abort ();

2738 }

2739

2740 /* Check compatibility of ordering options. */

2741

2742 static void

2743 check_ordering_compatibility (void)

2744 {

2745 struct keyfield const *key;

2746

2747 for (key = keylist; key; key = key->next)

2748 if ((1 < (key->random + key->numeric + key->general_numeric + key->month

2749 + key->version + !!key->ignore))

2750 || (key->random && key->translate))

2751 {

2752 /* The following is too big, but guaranteed to be "big enough". */

2753 char opts[sizeof short_options];

2754 char *p = opts;

2755 if (key->ignore == nondictionary)

2756 *p++ = 'd';

2757 if (key->translate)

2758 *p++ = 'f';

2759 if (key->general_numeric)

2760 *p++ = 'g';

2761 if (key->ignore == nonprinting)

2762 *p++ = 'i';

2763 if (key->month)

2764 *p++ = 'M';

2765 if (key->numeric)

2766 *p++ = 'n';

2767 if (key->version)

2768 *p++ = 'V';

2769 if (key->random)

2770 *p++ = 'R';

2771 *p = '\0';

2772 incompatible_options (opts);

2773 }

2774 }

2775

2776 /* Parse the leading integer in STRING and store the resulting value

2777 (which must fit into size_t) into *VAL. Return the address of the

2778 suffix after the integer. If the value is too large, silently

2779 substitute SIZE_MAX. If MSGID is NULL, return NULL after

2780 failure; otherwise, report MSGID and exit on failure. */

2781

2782 static char const *

2783 parse_field_count (char const *string, size_t *val, char const *msgid)

2784 {

2785 char *suffix;

2786 uintmax_t n;

2787

2788 switch (xstrtoumax (string, &suffix, 10, &n, ""))

2789 {

2790 case LONGINT_OK:

2791 case LONGINT_INVALID_SUFFIX_CHAR:

2792 *val = n;

2793 if (*val == n)

2794 break;

2795 /* Fall through. */

2796 case LONGINT_OVERFLOW:

2797 case LONGINT_OVERFLOW | LONGINT_INVALID_SUFFIX_CHAR:

2798 *val = SIZE_MAX;

2799 break;

2800

2801 case LONGINT_INVALID:

2802 if (msgid)

2803 error (SORT_FAILURE, 0, _("%s: invalid count at start of %s"),

2804 _(msgid), quote (string));

2805 return NULL;

2806 }

2807

2808 return suffix;

2809 }

2810

2811 /* Handle interrupts and hangups. */

2812

2813 static void

2814 sighandler (int sig)

2815 {

2816 if (! SA_NOCLDSTOP)

2817 signal (sig, SIG_IGN);

2818

2819 cleanup ();

2820

2821 signal (sig, SIG_DFL);

2822 raise (sig);

2823 }

2824

2825 /* Set the ordering options for KEY specified in S.

2826 Return the address of the first character in S that

2827 is not a valid ordering option.

2828 BLANKTYPE is the kind of blanks that 'b' should skip. */

2829

2830 static char *

2831 set_ordering (const char *s, struct keyfield *key, enum blanktype blanktype)

2832 {

2833 while (*s)

2834 {

2835 switch (*s)

2836 {

2837 case 'b':

2838 if (blanktype == bl_start || blanktype == bl_both)

2839 key->skipsblanks = true;

2840 if (blanktype == bl_end || blanktype == bl_both)

2841 key->skipeblanks = true;

2842 break;

2843 case 'd':

2844 key->ignore = nondictionary;

2845 break;

2846 case 'f':

2847 key->translate = fold_toupper;

2848 break;

2849 case 'g':

2850 key->general_numeric = true;

2851 break;

2852 case 'i':

2853 /* Option order should not matter, so don't let -i override

2854 -d. -d implies -i, but -i does not imply -d. */

2855 if (! key->ignore)

2856 key->ignore = nonprinting;

2857 break;

2858 case 'M':

2859 key->month = true;

2860 break;

2861 case 'n':

2862 key->numeric = true;

2863 break;

2864 case 'R':

2865 key->random = true;

2866 break;

2867 case 'r':

2868 key->reverse = true;

2869 break;

2870 case 'V':

2871 key->version = true;

2872 break;

2873 default:

2874 return (char *) s;

2875 }

2876 ++s;

2877 }

2878 return (char *) s;

2879 }

2880

2881 static struct keyfield *

2882 key_init (struct keyfield *key)

2883 {

2884 memset (key, 0, sizeof *key);

2885 key->eword = SIZE_MAX;

2886 return key;

2887 }

2888

2889 int

2890 main (int argc, char **argv)

2891 {

2892 struct keyfield *key;

2893 struct keyfield key_buf;

2894 struct keyfield gkey;

2895 char const *s;

2896 int c = 0;

2897 char checkonly = 0;

2898 bool mergeonly = false;

2899 char *random_source = NULL;

2900 bool need_random = false;

2901 size_t nfiles = 0;

2902 bool posixly_correct = (getenv ("POSIXLY_CORRECT") != NULL);

2903 bool obsolete_usage = (posix2_version () < 200112);

2904 char **files;

2905 char *files_from = NULL;

2906 struct Tokens tok;

2907 char const *outfile = NULL;

2908

2909 initialize_main (&argc, &argv);

2910 set_program_name (argv[0]);

2911 setlocale (LC_ALL, "");

2912 bindtextdomain (PACKAGE, LOCALEDIR);

2913 textdomain (PACKAGE);

2914

2915 initialize_exit_failure (SORT_FAILURE);

2916

2917 hard_LC_COLLATE = hard_locale (LC_COLLATE);

2918 #if HAVE_NL_LANGINFO

2919 hard_LC_TIME = hard_locale (LC_TIME);

2920 #endif

2921

2922 /* Get locale's representation of the decimal point. */

2923 {

2924 struct lconv const *locale = localeconv ();

2925

2926 /* If the locale doesn't define a decimal point, or if the decimal

2927 point is multibyte, use the C locale's decimal point. FIXME:

2928 add support for multibyte decimal points. */

2929 decimal_point = to_uchar (locale->decimal_point[0]);

2930 if (! decimal_point || locale->decimal_point[1])

2931 decimal_point = '.';

2932

2933 /* FIXME: add support for multibyte thousands separators. */

2934 thousands_sep = to_uchar (*locale->thousands_sep);

2935 if (! thousands_sep || locale->thousands_sep[1])

2936 thousands_sep = -1;

2937 }

2938

2939 have_read_stdin = false;

2940 inittables ();

2941

2942 {

2943 size_t i;

2944 static int const sig[] =

2945 {

2946 /* The usual suspects. */

2947 SIGALRM, SIGHUP, SIGINT, SIGPIPE, SIGQUIT, SIGTERM,

2948 #ifdef SIGPOLL

2949 SIGPOLL,

2950 #endif

2951 #ifdef SIGPROF

2952 SIGPROF,

2953 #endif

2954 #ifdef SIGVTALRM

2955 SIGVTALRM,

2956 #endif

2957 #ifdef SIGXCPU

2958 SIGXCPU,

2959 #endif

2960 #ifdef SIGXFSZ

2961 SIGXFSZ,

2962 #endif

2963 };

2964 enum { nsigs = sizeof sig / sizeof sig[0] };

2965

2966 #if SA_NOCLDSTOP

2967 struct sigaction act;

2968

2969 sigemptyset (&caught_signals);

2970 for (i = 0; i < nsigs; i++)

2971 {

2972 sigaction (sig[i], NULL, &act);

2973 if (act.sa_handler != SIG_IGN)

2974 sigaddset (&caught_signals, sig[i]);

2975 }

2976

2977 act.sa_handler = sighandler;

2978 act.sa_mask = caught_signals;

2979 act.sa_flags = 0;

2980

2981 for (i = 0; i < nsigs; i++)

2982 if (sigismember (&caught_signals, sig[i]))

2983 sigaction (sig[i], &act, NULL);

2984 #else

2985 for (i = 0; i < nsigs; i++)

2986 if (signal (sig[i], SIG_IGN) != SIG_IGN)

2987 {

2988 signal (sig[i], sighandler);

2989 siginterrupt (sig[i], 1);

2990 }

2991 #endif

2992 }

2993

2994 /* The signal mask is known, so it is safe to invoke exit_cleanup. */

2995 atexit (exit_cleanup);

2996

2997 gkey.sword = gkey.eword = SIZE_MAX;

2998 gkey.ignore = NULL;

2999 gkey.translate = NULL;

3000 gkey.numeric = gkey.general_numeric = gkey.random = gkey.version = false;

3001 gkey.month = gkey.reverse = false;

3002 gkey.skipsblanks = gkey.skipeblanks = false;

3003

3004 files = xnmalloc (argc, sizeof *files);

3005

3006 for (;;)

3007 {

3008 /* Parse an operand as a file after "--" was seen; or if

3009 pedantic and a file was seen, unless the POSIX version

3010 predates 1003.1-2001 and -c was not seen and the operand is

3011 "-o FILE" or "-oFILE". */

3012 int oi = -1;

3013

3014 if (c == -1

3015 || (posixly_correct && nfiles != 0

3016 && ! (obsolete_usage

3017 && ! checkonly

3018 && optind != argc

3019 && argv[optind][0] == '-' && argv[optind][1] == 'o'

3020 && (argv[optind][2] || optind + 1 != argc)))

3021 || ((c = getopt_long (argc, argv, short_options,

3022 long_options, &oi))

3023 == -1))

3024 {

3025 if (argc <= optind)

3026 break;

3027 files[nfiles++] = argv[optind++];

3028 }

3029 else switch (c)

3030 {

3031 case 1:

3032 key = NULL;

3033 if (optarg[0] == '+')

3034 {

3035 bool minus_pos_usage = (optind != argc && argv[optind][0] == '-'

3036 && ISDIGIT (argv[optind][1]));

3037 obsolete_usage |= minus_pos_usage & ~posixly_correct;

3038 if (obsolete_usage)

3039 {

3040 /* Treat +POS1 [-POS2] as a key if possible; but silently

3041 treat an operand as a file if it is not a valid +POS1. */

3042 key = key_init (&key_buf);

3043 s = parse_field_count (optarg + 1, &key->sword, NULL);

3044 if (s && *s == '.')

3045 s = parse_field_count (s + 1, &key->schar, NULL);

3046 if (! (key->sword | key->schar))

3047 key->sword = SIZE_MAX;

3048 if (! s || *set_ordering (s, key, bl_start))

3049 key = NULL;

3050 else

3051 {

3052 if (minus_pos_usage)

3053 {

3054 char const *optarg1 = argv[optind++];

3055 s = parse_field_count (optarg1 + 1, &key->eword,

3056 N_("invalid number after `-'"));

3057 if (*s == '.')

3058 s = parse_field_count (s + 1, &key->echar,

3059 N_("invalid number after `.'"));

3060 if (*set_ordering (s, key, bl_end))

3061 badfieldspec (optarg1,

3062 N_("stray character in field spec"));

3063 }

3064 insertkey (key);

3065 }

3066 }

3067 }

3068 if (! key)

3069 files[nfiles++] = optarg;

3070 break;

3071

3072 case SORT_OPTION:

3073 c = XARGMATCH ("--sort", optarg, sort_args, sort_types);

3074 /* Fall through. */

3075 case 'b':

3076 case 'd':

3077 case 'f':

3078 case 'g':

3079 case 'i':

3080 case 'M':

3081 case 'n':

3082 case 'r':

3083 case 'R':

3084 case 'V':

3085 {

3086 char str[2];

3087 str[0] = c;

3088 str[1] = '\0';

3089 set_ordering (str, &gkey, bl_both);

3090 }

3091 break;

3092

3093 case CHECK_OPTION:

3094 c = (optarg

3095 ? XARGMATCH ("--check", optarg, check_args, check_types)

3096 : 'c');

3097 /* Fall through. */

3098 case 'c':

3099 case 'C':

3100 if (checkonly && checkonly != c)

3101 incompatible_options ("cC");

3102 checkonly = c;

3103 break;

3104

3105 case COMPRESS_PROGRAM_OPTION:

3106 if (compress_program && !STREQ (compress_program, optarg))

3107 error (SORT_FAILURE, 0, _("multiple compress programs specified"));

3108 compress_program = optarg;

3109 break;

3110

3111 case FILES0_FROM_OPTION:

3112 files_from = optarg;

3113 break;

3114

3115 case 'k':

3116 key = key_init (&key_buf);

3117

3118 /* Get POS1. */

3119 s = parse_field_count (optarg, &key->sword,

3120 N_("invalid number at field start"));

3121 if (! key->sword--)

3122 {

3123 /* Provoke with `sort -k0' */

3124 badfieldspec (optarg, N_("field number is zero"));

3125 }

3126 if (*s == '.')

3127 {

3128 s = parse_field_count (s + 1, &key->schar,

3129 N_("invalid number after `.'"));

3130 if (! key->schar--)

3131 {

3132 /* Provoke with `sort -k1.0' */

3133 badfieldspec (optarg, N_("character offset is zero"));

3134 }

3135 }

3136 if (! (key->sword | key->schar))

3137 key->sword = SIZE_MAX;

3138 s = set_ordering (s, key, bl_start);

3139 if (*s != ',')

3140 {

3141 key->eword = SIZE_MAX;

3142 key->echar = 0;

3143 }

3144 else

3145 {

3146 /* Get POS2. */

3147 s = parse_field_count (s + 1, &key->eword,

3148 N_("invalid number after `,'"));

3149 if (! key->eword--)

3150 {

3151 /* Provoke with `sort -k1,0' */

3152 badfieldspec (optarg, N_("field number is zero"));

3153 }

3154 if (*s == '.')

3155 s = parse_field_count (s + 1, &key->echar,

3156 N_("invalid number after `.'"));

3157 else

3158 {

3159 /* `-k 2,3' is equivalent to `+1 -3'. */

3160 key->eword++;

3161 }

3162 s = set_ordering (s, key, bl_end);

3163 }

3164 if (*s)

3165 badfieldspec (optarg, N_("stray character in field spec"));

3166 insertkey (key);

3167 break;

3168

3169 case 'm':

3170 mergeonly = true;

3171 break;

3172

3173 case NMERGE_OPTION:

3174 specify_nmerge (oi, c, optarg);

3175 break;

3176

3177 case 'o':

3178 if (outfile && !STREQ (outfile, optarg))

3179 error (SORT_FAILURE, 0, _("multiple output files specified"));

3180 outfile = optarg;

3181 break;

3182

3183 case RANDOM_SOURCE_OPTION:

3184 if (random_source && !STREQ (random_source, optarg))

3185 error (SORT_FAILURE, 0, _("multiple random sources specified"));

3186 random_source = optarg;

3187 break;

3188

3189 case 's':

3190 stable = true;

3191 break;

3192

3193 case 'S':

3194 specify_sort_size (oi, c, optarg);

3195 break;

3196

3197 case 't':

3198 {

3199 char newtab = optarg[0];

3200 if (! newtab)

3201 error (SORT_FAILURE, 0, _("empty tab"));

3202 if (optarg[1])

3203 {

3204 if (STREQ (optarg, "\\0"))

3205 newtab = '\0';

3206 else

3207 {

3208 /* Provoke with `sort -txx'. Complain about

3209 "multi-character tab" instead of "multibyte tab", so

3210 that the diagnostic's wording does not need to be

3211 changed once multibyte characters are supported. */

3212 error (SORT_FAILURE, 0, _("multi-character tab %s"),

3213 quote (optarg));

3214 }

3215 }

3216 if (tab != TAB_DEFAULT && tab != newtab)

3217 error (SORT_FAILURE, 0, _("incompatible tabs"));

3218 tab = newtab;

3219 }

3220 break;

3221

3222 case 'T':

3223 add_temp_dir (optarg);

3224 break;

3225

3226 case 'u':

3227 unique = true;

3228 break;

3229

3230 case 'y':

3231 /* Accept and ignore e.g. -y0 for compatibility with Solaris 2.x

3232 through Solaris 7. It is also accepted by many non-Solaris

3233 "sort" implementations, e.g., AIX 5.2, HP-UX 11i v2, IRIX 6.5.

3234 -y is marked as obsolete starting with Solaris 8 (1999), but is

3235 still accepted as of Solaris 10 prerelease (2004).

3236

3237 Solaris 2.5.1 "sort -y 100" reads the input file "100", but

3238 emulate Solaris 8 and 9 "sort -y 100" which ignores the "100",

3239 and which in general ignores the argument after "-y" if it

3240 consists entirely of digits (it can even be empty). */

3241 if (optarg == argv[optind - 1])

3242 {

3243 char const *p;

3244 for (p = optarg; ISDIGIT (*p); p++)

3245 continue;

3246 optind -= (*p != '\0');

3247 }

3248 break;

3249

3250 case 'z':

3251 eolchar = 0;

3252 break;

3253

3254 case_GETOPT_HELP_CHAR;

3255

3256 case_GETOPT_VERSION_CHAR (PROGRAM_NAME, AUTHORS);

3257

3258 default:

3259 usage (SORT_FAILURE);

3260 }

3261 }

3262

3263 if (files_from)

3264 {

3265 FILE *stream;

3266

3267 /* When using --files0-from=F, you may not specify any files

3268 on the command-line. */

3269 if (nfiles)

3270 {

3271 error (0, 0, _("extra operand %s"), quote (files[0]));

3272 fprintf (stderr, "%s\n",

3273 _("file operands cannot be combined with --files0-from"));

3274 usage (SORT_FAILURE);

3275 }

3276

3277 if (STREQ (files_from, "-"))

3278 stream = stdin;

3279 else

3280 {

3281 stream = fopen (files_from, "r");

3282 if (stream == NULL)

3283 error (SORT_FAILURE, errno, _("cannot open %s for reading"),

3284 quote (files_from));

3285 }

3286

3287 readtokens0_init (&tok);

3288

3289 if (! readtokens0 (stream, &tok) || fclose (stream) != 0)

3290 error (SORT_FAILURE, 0, _("cannot read file names from %s"),

3291 quote (files_from));

3292

3293 if (tok.n_tok)

3294 {

3295 size_t i;

3296 free (files);

3297 files = tok.tok;

3298 nfiles = tok.n_tok;

3299 for (i = 0; i < nfiles; i++)

3300 {

3301 if (STREQ (files[i], "-"))

3302 error (SORT_FAILURE, 0, _("when reading file names from stdin, "

3303 "no file name of %s allowed"),

3304 quote (files[i]));

3305 else if (files[i][0] == '\0')

3306 {

3307 /* Using the standard `filename:line-number:' prefix here is

3308 not totally appropriate, since NUL is the separator, not NL,

3309 but it might be better than nothing. */

3310 unsigned long int file_number = i + 1;

3311 error (SORT_FAILURE, 0,

3312 _("%s:%lu: invalid zero-length file name"),

3313 quotearg_colon (files_from), file_number);

3314 }

3315 }

3316 }

3317 else

3318 error (SORT_FAILURE, 0, _("no input from %s"),

3319 quote (files_from));

3320 }

3321

3322 /* Inheritance of global options to individual keys. */

3323 for (key = keylist; key; key = key->next)

3324 {

3325 if (! (key->ignore

3326 || key->translate

3327 || (key->skipsblanks

3328 | key->reverse

3329 | key->skipeblanks

3330 | key->month

3331 | key->numeric

3332 | key->version

3333 | key->general_numeric

3334 | key->random)))

3335 {

3336 key->ignore = gkey.ignore;

3337 key->translate = gkey.translate;

3338 key->skipsblanks = gkey.skipsblanks;

3339 key->skipeblanks = gkey.skipeblanks;

3340 key->month = gkey.month;

3341 key->numeric = gkey.numeric;

3342 key->general_numeric = gkey.general_numeric;

3343 key->random = gkey.random;

3344 key->reverse = gkey.reverse;

3345 key->version = gkey.version;

3346 }

3347

3348 need_random |= key->random;

3349 }

3350

3351 if (!keylist && (gkey.ignore

3352 || gkey.translate

3353 || (gkey.skipsblanks

3354 | gkey.skipeblanks

3355 | gkey.month

3356 | gkey.numeric

3357 | gkey.general_numeric

3358 | gkey.random

3359 | gkey.version)))

3360 {

3361 insertkey (&gkey);

3362 need_random |= gkey.random;

3363 }

3364

3365 check_ordering_compatibility ();

3366

3367 reverse = gkey.reverse;

3368

3369 if (need_random)

3370 {

3371 randread_source = randread_new (random_source, MD5_DIGEST_SIZE);

3372 if (! randread_source)

3373 die (_("open failed"), random_source);

3374 }

3375

3376 if (temp_dir_count == 0)

3377 {

3378 char const *tmp_dir = getenv ("TMPDIR");

3379 add_temp_dir (tmp_dir ? tmp_dir : DEFAULT_TMPDIR);

3380 }

3381

3382 if (nfiles == 0)

3383 {

3384 static char *minus = (char *) "-";

3385 nfiles = 1;

3386 free (files);

3387 files = −

3388 }

3389

3390 /* Need to re-check that we meet the minimum requirement for memory

3391 usage with the final value for NMERGE. */

3392 if (0 < sort_size)

3393 sort_size = MAX (sort_size, MIN_SORT_SIZE);

3394

3395 if (checkonly)

3396 {

3397 if (nfiles > 1)

3398 error (SORT_FAILURE, 0, _("extra operand %s not allowed with -%c"),

3399 quote (files[1]), checkonly);

3400

3401 if (outfile)

3402 {

3403 static char opts[] = {0, 'o', 0};

3404 opts[0] = checkonly;

3405 incompatible_options (opts);

3406 }

3407

3408 /* POSIX requires that sort return 1 IFF invoked with -c or -C and the

3409 input is not properly sorted. */

3410 exit (check (files[0], checkonly) ? EXIT_SUCCESS : SORT_OUT_OF_ORDER);

3411 }

3412

3413 if (mergeonly)

3414 {

3415 struct sortfile *sortfiles = xcalloc (nfiles, sizeof *sortfiles);

3416 size_t i;

3417

3418 for (i = 0; i < nfiles; ++i)

3419 sortfiles[i].name = files[i];

3420

3421 merge (sortfiles, 0, nfiles, outfile);

3422 IF_LINT (free (sortfiles));

3423 }

3424 else

3425 sort (files, nfiles, outfile);

3426

3427 if (have_read_stdin && fclose (stdin) == EOF)

3428 die (_("close failed"), "-");

3429

3430 exit (EXIT_SUCCESS);

3431 }