From 465d1b0d6518c5d980f2db4c2d769f9905bdd902 Mon Sep 17 00:00:00 2001 From: Lasse Collin Date: Wed, 25 Nov 2009 11:19:20 +0200 Subject: [PATCH] Create sparse files by default when decompressing into a regular file. Sparse file creation can be disabled with --no-sparse. I don't promise yet that the name of this option won't change before 5.0.0. It's possible that the code, that checks when it is safe to use sparse output on stdout, is not good enough, and a more flexible command line option is needed to configure sparse file handling. --- src/xz/args.c | 6 ++ src/xz/coder.c | 33 +++---- src/xz/file_io.c | 243 ++++++++++++++++++++++++++++++++++++++++------- src/xz/file_io.h | 34 +++++-- src/xz/message.c | 1 + src/xz/xz.1 | 11 +++ 6 files changed, 272 insertions(+), 56 deletions(-) diff --git a/src/xz/args.c b/src/xz/args.c index 75b6220..bb6e27b 100644 --- a/src/xz/args.c +++ b/src/xz/args.c @@ -43,6 +43,7 @@ parse_real(args_info *args, int argc, char **argv) OPT_LZMA1, OPT_LZMA2, + OPT_NO_SPARSE, OPT_FILES, OPT_FILES0, OPT_INFO_MEMORY, @@ -65,6 +66,7 @@ parse_real(args_info *args, int argc, char **argv) { "force", no_argument, NULL, 'f' }, { "stdout", no_argument, NULL, 'c' }, { "to-stdout", no_argument, NULL, 'c' }, + { "no-sparse", no_argument, NULL, OPT_NO_SPARSE }, { "suffix", required_argument, NULL, 'S' }, // { "recursive", no_argument, NULL, 'r' }, // TODO { "files", optional_argument, NULL, OPT_FILES }, @@ -339,6 +341,10 @@ parse_real(args_info *args, int argc, char **argv) break; } + case OPT_NO_SPARSE: + io_no_sparse(); + break; + case OPT_FILES: args->files_delim = '\n'; diff --git a/src/xz/coder.c b/src/xz/coder.c index 7cf6186..d58e7e3 100644 --- a/src/xz/coder.c +++ b/src/xz/coder.c @@ -33,8 +33,8 @@ static lzma_stream strm = LZMA_STREAM_INIT; static lzma_filter filters[LZMA_FILTERS_MAX + 1]; /// Input and output buffers -static uint8_t in_buf[IO_BUFFER_SIZE]; -static uint8_t out_buf[IO_BUFFER_SIZE]; +static io_buf in_buf; +static io_buf out_buf; /// 
Number of filters. Zero indicates that we are using a preset. static size_t filters_count = 0; @@ -275,7 +275,7 @@ coder_set_compression_settings(void) static bool is_format_xz(void) { - return strm.avail_in >= 6 && memcmp(in_buf, "\3757zXZ", 6) == 0; + return strm.avail_in >= 6 && memcmp(in_buf.u8, "\3757zXZ", 6) == 0; } @@ -289,7 +289,7 @@ is_format_lzma(void) // Decode the LZMA1 properties. lzma_filter filter = { .id = LZMA_FILTER_LZMA1 }; - if (lzma_properties_decode(&filter, NULL, in_buf, 5) != LZMA_OK) + if (lzma_properties_decode(&filter, NULL, in_buf.u8, 5) != LZMA_OK) return false; // A hack to ditch tons of false positives: We allow only dictionary @@ -317,7 +317,7 @@ is_format_lzma(void) // Again, if someone complains, this will be reconsidered. uint64_t uncompressed_size = 0; for (size_t i = 0; i < 8; ++i) - uncompressed_size |= (uint64_t)(in_buf[5 + i]) << (i * 8); + uncompressed_size |= (uint64_t)(in_buf.u8[5 + i]) << (i * 8); if (uncompressed_size != UINT64_MAX && uncompressed_size > (UINT64_C(1) << 38)) @@ -444,15 +444,16 @@ coder_normal(file_pair *pair) // Assume that something goes wrong. bool success = false; - strm.next_out = out_buf; + strm.next_out = out_buf.u8; strm.avail_out = IO_BUFFER_SIZE; while (!user_abort) { // Fill the input buffer if it is empty and we haven't reached // end of file yet. if (strm.avail_in == 0 && !pair->src_eof) { - strm.next_in = in_buf; - strm.avail_in = io_read(pair, in_buf, IO_BUFFER_SIZE); + strm.next_in = in_buf.u8; + strm.avail_in = io_read( + pair, &in_buf, IO_BUFFER_SIZE); if (strm.avail_in == SIZE_MAX) break; @@ -466,11 +467,11 @@ coder_normal(file_pair *pair) // Write out if the output buffer became full. 
if (strm.avail_out == 0) { - if (opt_mode != MODE_TEST && io_write(pair, out_buf, + if (opt_mode != MODE_TEST && io_write(pair, &out_buf, IO_BUFFER_SIZE - strm.avail_out)) break; - strm.next_out = out_buf; + strm.next_out = out_buf.u8; strm.avail_out = IO_BUFFER_SIZE; } @@ -487,7 +488,7 @@ coder_normal(file_pair *pair) // when trying to get at least some useful // data out of damaged files. if (opt_mode != MODE_TEST && io_write(pair, - out_buf, IO_BUFFER_SIZE + &out_buf, IO_BUFFER_SIZE - strm.avail_out)) break; } @@ -502,7 +503,7 @@ coder_normal(file_pair *pair) // input, and thus pair->src_eof // becomes true. strm.avail_in = io_read( - pair, in_buf, 1); + pair, &in_buf, 1); if (strm.avail_in == SIZE_MAX) break; @@ -579,14 +580,14 @@ coder_passthru(file_pair *pair) if (user_abort) return false; - if (io_write(pair, in_buf, strm.avail_in)) + if (io_write(pair, &in_buf, strm.avail_in)) return false; strm.total_in += strm.avail_in; strm.total_out = strm.total_in; message_progress_update(); - strm.avail_in = io_read(pair, in_buf, IO_BUFFER_SIZE); + strm.avail_in = io_read(pair, &in_buf, IO_BUFFER_SIZE); if (strm.avail_in == SIZE_MAX) return false; } @@ -613,8 +614,8 @@ coder_run(const char *filename) // Read the first chunk of input data. This is needed to detect // the input file type (for now, only for decompression). - strm.next_in = in_buf; - strm.avail_in = io_read(pair, in_buf, IO_BUFFER_SIZE); + strm.next_in = in_buf.u8; + strm.avail_in = io_read(pair, &in_buf, IO_BUFFER_SIZE); switch (coder_init(pair)) { case CODER_INIT_NORMAL: diff --git a/src/xz/file_io.c b/src/xz/file_io.c index b79d0b7..be5db73 100644 --- a/src/xz/file_io.c +++ b/src/xz/file_io.c @@ -37,6 +37,17 @@ static bool warn_fchown; #endif +/// If true, try to create sparse files when decompressing. +static bool try_sparse = true; + +/// File status flags of standard output. This is used by io_open_dest() +/// and io_close_dest(). 
+static int stdout_flags = 0; + + +static bool io_write_buf(file_pair *pair, const uint8_t *buf, size_t size); + + extern void io_init(void) { @@ -63,6 +74,14 @@ io_init(void) } +extern void +io_no_sparse(void) +{ + try_sparse = false; + return; +} + + /// \brief Unlink a file /// /// This tries to verify that the file being unlinked really is the file that @@ -498,42 +517,42 @@ io_open_dest(file_pair *pair) #ifdef TUKLIB_DOSLIKE setmode(STDOUT_FILENO, O_BINARY); #endif - return false; - } - - pair->dest_name = suffix_get_dest_name(pair->src_name); - if (pair->dest_name == NULL) - return true; + } else { + pair->dest_name = suffix_get_dest_name(pair->src_name); + if (pair->dest_name == NULL) + return true; - // If --force was used, unlink the target file first. - if (opt_force && unlink(pair->dest_name) && errno != ENOENT) { - message_error("%s: Cannot unlink: %s", - pair->dest_name, strerror(errno)); - free(pair->dest_name); - return true; - } + // If --force was used, unlink the target file first. + if (opt_force && unlink(pair->dest_name) && errno != ENOENT) { + message_error("%s: Cannot unlink: %s", + pair->dest_name, strerror(errno)); + free(pair->dest_name); + return true; + } - if (opt_force && unlink(pair->dest_name) && errno != ENOENT) { - message_error("%s: Cannot unlink: %s", pair->dest_name, - strerror(errno)); - free(pair->dest_name); - return true; - } + if (opt_force && unlink(pair->dest_name) && errno != ENOENT) { + message_error("%s: Cannot unlink: %s", + pair->dest_name, strerror(errno)); + free(pair->dest_name); + return true; + } - // Open the file. - const int flags = O_WRONLY | O_BINARY | O_NOCTTY | O_CREAT | O_EXCL; - const mode_t mode = S_IRUSR | S_IWUSR; - pair->dest_fd = open(pair->dest_name, flags, mode); + // Open the file. 
+ const int flags = O_WRONLY | O_BINARY | O_NOCTTY + | O_CREAT | O_EXCL; + const mode_t mode = S_IRUSR | S_IWUSR; + pair->dest_fd = open(pair->dest_name, flags, mode); - if (pair->dest_fd == -1) { - // Don't bother with error message if user requested - // us to exit anyway. - if (!user_abort) - message_error("%s: %s", pair->dest_name, - strerror(errno)); + if (pair->dest_fd == -1) { + // Don't bother with error message if user requested + // us to exit anyway. + if (!user_abort) + message_error("%s: %s", pair->dest_name, + strerror(errno)); - free(pair->dest_name); - return true; + free(pair->dest_name); + return true; + } } // If this really fails... well, we have a safe fallback. @@ -545,6 +564,65 @@ io_open_dest(file_pair *pair) #elif !defined(TUKLIB_DOSLIKE) pair->dest_st.st_dev = 0; pair->dest_st.st_ino = 0; +#endif +#ifndef TUKLIB_DOSLIKE + } else if (try_sparse && opt_mode == MODE_DECOMPRESS) { + // When writing to standard output, we need to be extra + // careful: + // - It may be connected to something else than + // a regular file. + // - We aren't necessarily writing to a new empty file + // or to the end of an existing file. + // - O_APPEND may be active. + // + // TODO: I'm keeping this disabled for DOS-like systems + // for now. FAT doesn't support sparse files, but NTFS + // does, so maybe this should be enabled on Windows after + // some testing. + if (pair->dest_fd == STDOUT_FILENO) { + if (!S_ISREG(pair->dest_st.st_mode)) + return false; + + const int flags = fcntl(STDOUT_FILENO, F_GETFL); + if (flags == -1) + return false; + + if (flags & O_APPEND) { + // Creating a sparse file is not possible + // when O_APPEND is active (it's used by + // shell's >> redirection). As I understand + // it, it is safe to temporarily disable + // O_APPEND in xz, because if someone + // happened to write to the same file at the + // same time, results would be bad anyway + // (users shouldn't assume that xz uses any + // specific block size when writing data). 
+ // + // The write position may be something else + // than the end of the file, so we must fix + // it to start writing at the end of the file + // to imitate O_APPEND. + if (lseek(STDOUT_FILENO, 0, SEEK_END) == -1) + return false; + + if (fcntl(STDOUT_FILENO, F_SETFL, + flags & ~O_APPEND) == -1) + return false; + + // Remember the flags so that io_close_dest() + // can restore them. + stdout_flags = flags; + + } else if (lseek(STDOUT_FILENO, 0, SEEK_CUR) + != pair->dest_st.st_size) { + // Writing won't start exactly at the end + // of the file. We cannot use sparse output, + // because it would probably corrupt the file. + return false; + } + } + + pair->dest_try_sparse = true; #endif } @@ -562,6 +640,21 @@ io_open_dest(file_pair *pair) static int io_close_dest(file_pair *pair, bool success) { + // If io_open_dest() has disabled O_APPEND, restore it here. + if (stdout_flags != 0) { + assert(pair->dest_fd == STDOUT_FILENO); + + const int fail = fcntl(STDOUT_FILENO, F_SETFL, stdout_flags); + stdout_flags = 0; + + if (fail == -1) { + message_error(_("Error restoring the O_APPEND flag " + "to standard output: %s"), + strerror(errno)); + return -1; + } + } + if (pair->dest_fd == -1 || pair->dest_fd == STDOUT_FILENO) return 0; @@ -603,6 +696,8 @@ io_open(const char *src_name) .src_fd = -1, .dest_fd = -1, .src_eof = false, + .dest_try_sparse = false, + .dest_pending_sparse = 0, }; // Block the signals, for which we have a custom signal handler, so @@ -629,6 +724,29 @@ io_open(const char *src_name) extern void io_close(file_pair *pair, bool success) { + // Take care of sparseness at the end of the output file. + if (success && pair->dest_try_sparse + && pair->dest_pending_sparse > 0) { + // Seek forward one byte less than the size of the pending + // hole, then write one zero-byte. This way the file grows + // to its correct size. An alternative would be to use + // ftruncate() but that isn't portable enough (e.g.
it + // doesn't work with FAT on Linux; FAT isn't that important + // since it doesn't support sparse files anyway, but we don't + // want to create corrupt files on it). + if (lseek(pair->dest_fd, pair->dest_pending_sparse - 1, + SEEK_CUR) == -1) { + message_error(_("%s: Seeking failed when trying " + "to create a sparse file: %s"), + pair->dest_name, strerror(errno)); + success = false; + } else { + const uint8_t zero[1] = { '\0' }; + if (io_write_buf(pair, zero, 1)) + success = false; + } + } + signals_block(); if (success && pair->dest_fd != STDOUT_FILENO) @@ -651,11 +769,12 @@ io_close(file_pair *pair, bool success) extern size_t -io_read(file_pair *pair, uint8_t *buf, size_t size) +io_read(file_pair *pair, io_buf *buf_union, size_t size) { // We use small buffers here. assert(size < SSIZE_MAX); + uint8_t *buf = buf_union->u8; size_t left = size; while (left > 0) { @@ -691,8 +810,21 @@ io_read(file_pair *pair, uint8_t *buf, size_t size) } -extern bool -io_write(const file_pair *pair, const uint8_t *buf, size_t size) +static bool +is_sparse(const io_buf *buf) +{ + assert(IO_BUFFER_SIZE % sizeof(uint64_t) == 0); + + for (size_t i = 0; i < ARRAY_SIZE(buf->u64); ++i) + if (buf->u64[i] != 0) + return false; + + return true; +} + + +static bool +io_write_buf(file_pair *pair, const uint8_t *buf, size_t size) { assert(size < SSIZE_MAX); @@ -731,3 +863,46 @@ io_write(const file_pair *pair, const uint8_t *buf, size_t size) return false; } + + +extern bool +io_write(file_pair *pair, const io_buf *buf, size_t size) +{ + assert(size <= IO_BUFFER_SIZE); + + if (pair->dest_try_sparse) { + // Check if the block is sparse (contains only zeros). If it + // is sparse, we just store the amount and return. We will take + // care of actually skipping over the hole when we hit the + // next data block or close the file.
+ // + // Since io_close() requires that dest_pending_sparse > 0 + // if the file ends with a sparse block, we must also return + // if size == 0 to avoid doing the lseek(). + if (size == IO_BUFFER_SIZE) { + if (is_sparse(buf)) { + pair->dest_pending_sparse += size; + return false; + } + } else if (size == 0) { + return false; + } + + // This is not a sparse block. If we have a pending hole, + // skip it now. + if (pair->dest_pending_sparse > 0) { + if (lseek(pair->dest_fd, pair->dest_pending_sparse, + SEEK_CUR) == -1) { + message_error(_("%s: Seeking failed when " + "trying to create a sparse " + "file: %s"), pair->dest_name, + strerror(errno)); + return true; + } + + pair->dest_pending_sparse = 0; + } + } + + return io_write_buf(pair, buf->u8, size); +} diff --git a/src/xz/file_io.h b/src/xz/file_io.h index b0bbe11..58bf7b5 100644 --- a/src/xz/file_io.h +++ b/src/xz/file_io.h @@ -11,13 +11,22 @@ /////////////////////////////////////////////////////////////////////////////// // Some systems have suboptimal BUFSIZ. Use a bit bigger value on them. +// We also need that IO_BUFFER_SIZE is a multiple of 8 (sizeof(uint64_t)) #if BUFSIZ <= 1024 # define IO_BUFFER_SIZE 8192 #else -# define IO_BUFFER_SIZE BUFSIZ +# define IO_BUFFER_SIZE (BUFSIZ & ~7U) #endif +/// is_sparse() accesses the buffer as uint64_t for maximum speed. +/// Use a union to make sure that the buffer is properly aligned. +typedef union { + uint8_t u8[IO_BUFFER_SIZE]; + uint64_t u64[IO_BUFFER_SIZE / sizeof(uint64_t)]; +} io_buf; + + typedef struct { /// Name of the source filename (as given on the command line) or /// pointer to static "(stdin)" when reading from standard input. @@ -33,15 +42,24 @@ typedef struct { /// File descriptor of the target file int dest_fd; + /// True once end of the source file has been detected. + bool src_eof; + + /// If true, we look for long chunks of zeros and try to create + /// a sparse file. + bool dest_try_sparse; + + /// This is used only if dest_try_sparse is true.
This holds the + /// number of zero bytes we haven't written out, because we plan + /// to make that byte range a sparse chunk. + off_t dest_pending_sparse; + /// Stat of the source file. struct stat src_st; /// Stat of the destination file. struct stat dest_st; - /// True once end of the source file has been detected. - bool src_eof; - } file_pair; @@ -49,6 +67,10 @@ typedef struct { extern void io_init(void); +/// \brief Disable creation of sparse files when decompressing +extern void io_no_sparse(void); + + /// \brief Opens a file pair extern file_pair *io_open(const char *src_name); @@ -72,7 +94,7 @@ extern void io_close(file_pair *pair, bool success); /// \return On success, number of bytes read is returned. On end of /// file zero is returned and pair->src_eof set to true. /// On error, SIZE_MAX is returned and error message printed. -extern size_t io_read(file_pair *pair, uint8_t *buf, size_t size); +extern size_t io_read(file_pair *pair, io_buf *buf, size_t size); /// \brief Writes a buffer to the destination file @@ -83,4 +105,4 @@ extern size_t io_read(file_pair *pair, uint8_t *buf, size_t size); /// /// \return On success, zero is returned. On error, -1 is returned /// and error message printed. 
-extern bool io_write(const file_pair *pair, const uint8_t *buf, size_t size); +extern bool io_write(file_pair *pair, const io_buf *buf, size_t size); diff --git a/src/xz/message.c b/src/xz/message.c index be7c3fa..4f8ca00 100644 --- a/src/xz/message.c +++ b/src/xz/message.c @@ -1072,6 +1072,7 @@ message_help(bool long_help) if (long_help) puts(_( +" --no-sparse do not create sparse files when decompressing\n" " -S, --suffix=.SUF use the suffix `.SUF' on compressed files\n" " --files=[FILE] read filenames to process from FILE; if FILE is\n" " omitted, filenames are read from the standard input;\n" diff --git a/src/xz/xz.1 b/src/xz/xz.1 index b811562..94aa562 100644 --- a/src/xz/xz.1 +++ b/src/xz/xz.1 @@ -336,6 +336,17 @@ Write the compressed or decompressed data to standard output instead of a file. This implies .BR \-\-keep . .TP +.B \-\-no\-sparse +Disable creation of sparse files. By default, if decompressing into +a regular file, +.B xz +tries to make the file sparse if the decompressed data contains long +sequences of binary zeros. It works also when writing to standard output +as long as standard output is connected to a regular file, and certain +additional conditions are met to make it safe. Creating sparse files may +save disk space and speed up the decompression by reducing the amount of +disk I/O. +.TP \fB\-S\fR \fI.suf\fR, \fB\-\-suffix=\fI.suf When compressing, use .I .suf -- 2.39.2