1 ///////////////////////////////////////////////////////////////////////////////
4 /// \brief LZ in window
6 // Copyright (C) 1999-2006 Igor Pavlov
7 // Copyright (C) 2007 Lasse Collin
9 // This library is free software; you can redistribute it and/or
10 // modify it under the terms of the GNU Lesser General Public
11 // License as published by the Free Software Foundation; either
12 // version 2.1 of the License, or (at your option) any later version.
14 // This library is distributed in the hope that it will be useful,
15 // but WITHOUT ANY WARRANTY; without even the implied warranty of
16 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 // Lesser General Public License for more details.
19 ///////////////////////////////////////////////////////////////////////////////
21 #include "lz_encoder_private.h"
43 /// This is needed in two places so provide a macro.
// NOTE(review): the +1 presumably reserves index 0 so that it can act as
// EMPTY_HASH_VALUE (the hash reset below fills the table with that value,
// and a later comment says the first byte appears at buffer[1 - offset]) —
// confirm against the match finder implementations.
44 #define get_cyclic_buffer_size(history_size) ((history_size) + 1)
47 /// Calculate certain match finder properties and validate the calculated
48 /// values. This is as its own function, because *num_items is needed to
49 /// calculate memory requirements in common/memory.c.
//
// Outputs (all written through the restrict pointers):
//   *hash_mask      - mask applied when hashing (written in elided code;
//                     unused by BT2 per the note below)
//   *hash_size_sum  - variable hash size plus the match-finder-specific
//                     fixed hash size
//   *num_items      - total number of uint32_t items to allocate for the
//                     hash table plus the sons array
// The caller (lzma_lz_encoder_reset) treats a nonzero return as a
// validation failure; the return statements themselves are in elided code.
51 lzma_lz_encoder_hash_properties(lzma_match_finder match_finder,
52 uint32_t history_size, uint32_t *restrict hash_mask,
53 uint32_t *restrict hash_size_sum, uint32_t *restrict num_items)
55 uint32_t fix_hash_size;
// Select the fixed part of the hash size for the chosen match finder.
// (The case labels between these assignments are not visible here.)
58 switch (match_finder) {
61 fix_hash_size = LZMA_HC3_FIX_HASH_SIZE;
67 fix_hash_size = LZMA_HC4_FIX_HASH_SIZE;
73 fix_hash_size = LZMA_BT2_FIX_HASH_SIZE;
79 fix_hash_size = LZMA_BT3_FIX_HASH_SIZE;
85 fix_hash_size = LZMA_BT4_FIX_HASH_SIZE;
// Compute the variable hash size (hs, declared in elided code).
// NOTE(review): this comparison uses LZMA_BT2 while the checks below use
// LZMA_MF_HC4/LZMA_MF_BT4 — inconsistent enumerator naming; verify both
// spellings exist in the headers or unify them.
96 if (match_finder == LZMA_BT2) {
97 // NOTE: hash_mask is not used by the BT2 match finder,
98 // but it is initialized just in case.
99 hs = LZMA_BT2_HASH_SIZE;
104 hs = history_size - 1;
// Cap the variable hash size for the 4-byte-hash finders; the body of
// this branch is in elided lines.
112 if (hs > (UINT32_C(1) << 24)) {
113 if (match_finder == LZMA_MF_HC4
114 || match_finder == LZMA_MF_BT4)
// Total hash items = variable part + fixed part; total allocation also
// covers the sons array (one or two uint32_t per cyclic buffer slot —
// sons is set in elided code).
124 *hash_size_sum = hs + fix_hash_size;
126 *num_items = *hash_size_sum
127 + get_cyclic_buffer_size(history_size) * sons;
// Initialize/reset the LZ encoder sliding window and match finder.
// Allocates (or reuses) the history buffer and the hash/sons arrays,
// selects the match finder callbacks, and stores the process() callback.
// On any failure the partially initialized state is torn down with
// lzma_lz_encoder_end() before returning an error code.
// (The return type line and several interior lines are elided in this view.)
134 lzma_lz_encoder_reset(lzma_lz_encoder *lz, lzma_allocator *allocator,
135 bool (*process)(lzma_coder *coder, uint8_t *restrict out,
136 size_t *restrict out_pos, size_t out_size),
137 size_t history_size, size_t additional_buffer_before,
138 size_t match_max_len, size_t additional_buffer_after,
139 lzma_match_finder match_finder, uint32_t match_finder_cycles,
140 const uint8_t *preset_dictionary,
141 size_t preset_dictionary_size)
// Start in the normal encoding state (not flushing/finishing).
143 lz->sequence = SEQ_RUN;
149 // Validate history size.
150 if (history_size < LZMA_DICTIONARY_SIZE_MIN
151 || history_size > LZMA_DICTIONARY_SIZE_MAX) {
152 lzma_lz_encoder_end(lz, allocator);
153 return LZMA_HEADER_ERROR;
// The 256-byte headroom is required by lzma_lz_encoder_normalize();
// see also the magic-constant TODO below.
156 assert(history_size <= MAX_VAL_FOR_NORMALIZE - 256);
157 assert(LZMA_DICTIONARY_SIZE_MAX <= MAX_VAL_FOR_NORMALIZE - 256);
159 // Calculate the size of the history buffer to allocate.
160 // TODO: Get a reason for magic constant of 256.
161 const size_t size_reserv = (history_size + additional_buffer_before
162 + match_max_len + additional_buffer_after) / 2 + 256;
// keep_size_before: bytes of history that must stay behind read_pos.
// keep_size_after: lookahead that must stay ahead of read_pos.
164 lz->keep_size_before = history_size + additional_buffer_before;
165 lz->keep_size_after = match_max_len + additional_buffer_after;
// NOTE(review): the continuation of this expression (presumably
// "+ size_reserv") is on an elided line.
167 const size_t buffer_size = lz->keep_size_before + lz->keep_size_after
170 // Allocate history buffer if its size has changed.
171 if (buffer_size != lz->size) {
172 lzma_free(lz->buffer, allocator);
173 lz->buffer = lzma_alloc(buffer_size, allocator);
174 if (lz->buffer == NULL) {
175 lzma_lz_encoder_end(lz, allocator);
176 return LZMA_MEM_ERROR;
180 // Allocation successful. Store the new size.
181 lz->size = buffer_size;
183 // Reset in window variables.
195 // Validate match_finder, set function pointers and a few match
196 // finder specific variables.
197 switch (match_finder) {
// Hash-chain finders use a smaller default cut_value than the
// binary-tree finders below. (Case labels are on elided lines.)
200 lz->get_matches = &lzma_hc3_get_matches;
201 lz->skip = &lzma_hc3_skip;
202 lz->cut_value = 8 + (match_max_len >> 2);
207 lz->get_matches = &lzma_hc4_get_matches;
208 lz->skip = &lzma_hc4_skip;
209 lz->cut_value = 8 + (match_max_len >> 2);
214 lz->get_matches = &lzma_bt2_get_matches;
215 lz->skip = &lzma_bt2_skip;
216 lz->cut_value = 16 + (match_max_len >> 1);
221 lz->get_matches = &lzma_bt3_get_matches;
222 lz->skip = &lzma_bt3_skip;
223 lz->cut_value = 16 + (match_max_len >> 1);
228 lz->get_matches = &lzma_bt4_get_matches;
229 lz->skip = &lzma_bt4_skip;
230 lz->cut_value = 16 + (match_max_len >> 1);
// Unknown match finder (default case label elided): clean up and reject.
234 lzma_lz_encoder_end(lz, allocator);
235 return LZMA_HEADER_ERROR;
238 // Check if we have been requested to use a non-default cut_value.
239 if (match_finder_cycles > 0)
240 lz->cut_value = match_finder_cycles;
242 lz->match_max_len = match_max_len;
243 lz->cyclic_buffer_size = get_cyclic_buffer_size(history_size);
// num_items is declared on an elided line (original line ~246).
245 uint32_t hash_size_sum;
247 if (lzma_lz_encoder_hash_properties(match_finder, history_size,
248 &lz->hash_mask, &hash_size_sum, &num_items)) {
249 lzma_lz_encoder_end(lz, allocator);
250 return LZMA_HEADER_ERROR;
// (Re)allocate the hash + sons arrays only if the required size changed.
253 if (num_items != lz->num_items) {
254 #if UINT32_MAX >= SIZE_MAX / 4
255 // Check for integer overflow. (Huge dictionaries are not
256 // possible on 32-bit CPU.)
257 if (num_items > SIZE_MAX / sizeof(uint32_t)) {
258 lzma_lz_encoder_end(lz, allocator);
259 return LZMA_MEM_ERROR;
263 const size_t size_in_bytes
264 = (size_t)(num_items) * sizeof(uint32_t);
266 lzma_free(lz->hash, allocator);
267 lz->hash = lzma_alloc(size_in_bytes, allocator);
268 if (lz->hash == NULL) {
269 lzma_lz_encoder_end(lz, allocator);
270 return LZMA_MEM_ERROR;
273 lz->num_items = num_items;
// sons lives in the same allocation, right after the hash table;
// lzma_lz_encoder_end() therefore frees both with one lzma_free(hash).
276 lz->son = lz->hash + hash_size_sum;
278 // Reset the hash table to empty hash values.
280 uint32_t *restrict items = lz->hash;
282 for (uint32_t i = 0; i < hash_size_sum; ++i)
283 items[i] = EMPTY_HASH_VALUE;
286 lz->cyclic_buffer_pos = 0;
288 // Because zero is used as empty hash value, make the first byte
289 // appear at buffer[1 - offset].
292 // If we are using a preset dictionary, read it now.
293 // TODO: This isn't implemented yet so return LZMA_HEADER_ERROR.
294 if (preset_dictionary != NULL && preset_dictionary_size > 0) {
295 lzma_lz_encoder_end(lz, allocator);
296 return LZMA_HEADER_ERROR;
299 // Set the process function pointer.
300 lz->process = process;
// Free all memory owned by the LZ encoder. Safe to call on a partially
// initialized encoder (lzma_free on NULL is a no-op by liblzma convention —
// reset() calls this on every error path). Freeing lz->hash also releases
// the sons array, since lz->son points into the same allocation.
// NOTE(review): the elided lines between these frees likely also zero the
// size/num_items bookkeeping — confirm before relying on re-reset behavior.
307 lzma_lz_encoder_end(lzma_lz_encoder *lz, lzma_allocator *allocator)
309 lzma_free(lz->hash, allocator);
313 lzma_free(lz->buffer, allocator);
321 /// \brief Moves the data in the input window to free space for new data
323 /// lz->buffer is a sliding input window, which keeps lz->keep_size_before
324 /// bytes of input history available all the time. Now and then we need to
325 /// "slide" the buffer to make space for the new data to the end of the
326 /// buffer. At the same time, data older than keep_size_before is dropped.
329 move_window(lzma_lz_encoder *lz)
331 // buffer[move_offset] will become buffer[0].
// NOTE(review): the assert guards against keep_size_after while the
// offset is computed from keep_size_before — presumably intentional
// (fill_window only slides once read_pos >= size - keep_size_after),
// but worth confirming the invariant read_pos >= keep_size_before here.
332 assert(lz->read_pos > lz->keep_size_after);
333 size_t move_offset = lz->read_pos - lz->keep_size_before;
335 // We need one additional byte, since move_pos() moves on 1 byte.
336 // TODO: Clean up? At least document more.
340 assert(lz->write_pos > move_offset);
341 const size_t move_size = lz->write_pos - move_offset;
343 assert(move_offset + move_size <= lz->size);
// Regions may overlap, hence memmove() rather than memcpy().
345 memmove(lz->buffer, lz->buffer + move_offset, move_size);
// Keep absolute stream positions stable: offset grows by the amount the
// window slid, while the buffer-relative positions shrink by the same.
347 lz->offset += move_offset;
348 lz->read_pos -= move_offset;
349 lz->read_limit -= move_offset;
350 lz->write_pos -= move_offset;
356 /// \brief Tries to fill the input window (lz->buffer)
358 /// If we are the last encoder in the chain, our input data is in in[].
359 /// Otherwise we call the next filter in the chain to process in[] and
360 /// write its output to lz->buffer.
362 /// This function must not be called once it has returned LZMA_STREAM_END.
// (The return type line and the declarations of in_used/ret are elided.)
365 fill_window(lzma_coder *coder, lzma_allocator *allocator, const uint8_t *in,
366 size_t *in_pos, size_t in_size, lzma_action action)
368 assert(coder->lz.read_pos <= coder->lz.write_pos);
370 // Move the sliding window if needed.
371 if (coder->lz.read_pos >= coder->lz.size - coder->lz.keep_size_after)
372 move_window(&coder->lz);
376 if (coder->next.code == NULL) {
377 // Not using a filter, simply memcpy() as much as possible.
378 in_used = bufcpy(in, in_pos, in_size, coder->lz.buffer,
379 &coder->lz.write_pos, coder->lz.size);
// With no downstream filter, end-of-input plus a non-RUN action means
// everything we will ever get is now in the window.
381 if (action != LZMA_RUN && *in_pos == in_size)
382 ret = LZMA_STREAM_END;
// Filtered path: let the next coder write into our window directly.
// (The argument lines in between are elided.)
387 const size_t in_start = *in_pos;
388 ret = coder->next.code(coder->next.coder, allocator,
390 coder->lz.buffer, &coder->lz.write_pos,
391 coder->lz.size, action);
392 in_used = *in_pos - in_start;
395 // If end of stream has been reached or flushing completed, we allow
396 // the encoder to process all the input (that is, read_pos is allowed
397 // to reach write_pos). Otherwise we keep keep_size_after bytes
398 // available as prebuffer.
399 if (ret == LZMA_STREAM_END) {
400 assert(*in_pos == in_size);
401 coder->lz.read_limit = coder->lz.write_pos;
// Pick the sequence state from the action (switch header and case
// labels are on elided lines).
405 case LZMA_SYNC_FLUSH:
406 coder->lz.sequence = SEQ_FLUSH;
410 coder->lz.sequence = SEQ_FINISH;
// Reaching here with LZMA_RUN (or an unknown action) is a caller bug.
415 ret = LZMA_PROG_ERROR;
419 } else if (coder->lz.write_pos > coder->lz.keep_size_after) {
420 // This needs to be done conditionally, because if we got
421 // only little new input, there may be too little input
422 // to do any encoding yet.
423 coder->lz.read_limit = coder->lz.write_pos
424 - coder->lz.keep_size_after;
427 // Restart the match finder after finished LZMA_SYNC_FLUSH.
428 if (coder->lz.pending > 0
429 && coder->lz.read_pos < coder->lz.read_limit) {
430 // Match finder may update coder->pending and expects it to
431 // start from zero, so use a temporary variable.
432 const size_t pending = coder->lz.pending;
433 coder->lz.pending = 0;
435 // Rewind read_pos so that the match finder can hash
436 // the pending bytes.
437 assert(coder->lz.read_pos >= pending);
438 coder->lz.read_pos -= pending;
439 coder->lz.skip(&coder->lz, pending);
// Main LZ encoding loop: alternately refills the input window and runs the
// encoder-specific process() callback until either the output buffer is
// full or (in RUN mode) the input is exhausted.
// (The return type line and the in_size parameter line are elided here.)
447 lzma_lz_encode(lzma_coder *coder, lzma_allocator *allocator,
448 const uint8_t *restrict in, size_t *restrict in_pos,
450 uint8_t *restrict out, size_t *restrict out_pos,
451 size_t out_size, lzma_action action)
// When action != LZMA_RUN we keep looping even after input is consumed,
// so that flushing/finishing can drain the window.
453 while (*out_pos < out_size
454 && (*in_pos < in_size || action != LZMA_RUN)) {
455 // Read more data to coder->lz.buffer if needed.
456 if (coder->lz.sequence == SEQ_RUN
457 && coder->lz.read_pos >= coder->lz.read_limit)
458 return_if_error(fill_window(coder, allocator,
459 in, in_pos, in_size, action));
// process() returns true when it has finished (end of stream or
// flush completed).
462 if (coder->lz.process(coder, out, out_pos, out_size)) {
463 // Setting this to SEQ_RUN for cases when we are
464 // flushing. It doesn't matter when finishing.
465 coder->lz.sequence = SEQ_RUN;
466 return action != LZMA_RUN ? LZMA_STREAM_END : LZMA_OK;
474 /// \brief Normalizes hash values
476 /// lzma_lz_normalize is called when lz->pos hits MAX_VAL_FOR_NORMALIZE,
477 /// which currently happens once every 2 GiB of input data (to be exact,
478 /// after the first 2 GiB it happens once every 2 GiB minus dictionary_size
479 /// bytes). lz->pos is incremented by lzma_lz_move_pos().
481 /// lz->hash contains big amount of offsets relative to lz->buffer.
482 /// The offsets are stored as uint32_t, which is the only reasonable
483 /// datatype for these offsets; uint64_t would waste far too much RAM
484 /// and uint16_t would limit the dictionary to 64 KiB (far too small).
486 /// When compressing files over 2 GiB, lz->buffer needs to be moved forward
487 /// to avoid integer overflows. We scan the lz->hash array and fix every
488 /// value to match the updated lz->buffer.
490 lzma_lz_encoder_normalize(lzma_lz_encoder *lz)
// Everything older than read_pos - cyclic_buffer_size is outside the
// dictionary and can be forgotten.
492 const uint32_t subvalue = lz->read_pos - lz->cyclic_buffer_size;
493 assert(subvalue <= INT32_MAX);
// Walks the whole hash+sons allocation in one pass (sons follows hash
// in the same array, see lz->son = lz->hash + hash_size_sum in reset()).
496 const uint32_t num_items = lz->num_items;
497 uint32_t *restrict items = lz->hash;
499 for (uint32_t i = 0; i < num_items; ++i) {
500 // If the distance is greater than the dictionary
501 // size, we can simply mark the item as empty.
502 if (items[i] <= subvalue)
503 items[i] = EMPTY_HASH_VALUE;
// NOTE(review): the `else` keyword on original line 504 is not
// visible in this excerpt; the subtraction below must apply only to
// entries still inside the dictionary, not unconditionally.
505 items[i] -= subvalue;
509 // Update offset to match the new locations.
510 lz->offset -= subvalue;