1 ///////////////////////////////////////////////////////////////////////////////
3 /// \file lzma_encoder.c
4 /// \brief LZMA encoder
6 // Copyright (C) 1999-2006 Igor Pavlov
7 // Copyright (C) 2007 Lasse Collin
9 // This library is free software; you can redistribute it and/or
10 // modify it under the terms of the GNU Lesser General Public
11 // License as published by the Free Software Foundation; either
12 // version 2.1 of the License, or (at your option) any later version.
14 // This library is distributed in the hope that it will be useful,
15 // but WITHOUT ANY WARRANTY; without even the implied warranty of
16 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 // Lesser General Public License for more details.
19 ///////////////////////////////////////////////////////////////////////////////
21 // NOTE: If you want to keep the line length in 80 characters, set
22 // tab width to 4 or less in your editor when editing this file.
25 #include "lzma_encoder_private.h"
33 // These are as macros mostly because they use local range encoder variables.
35 #define literal_encode(subcoder, symbol) \
37 uint32_t context = 1; \
41 const uint32_t bit = ((symbol) >> i) & 1; \
42 bit_encode(subcoder[context], bit); \
43 context = (context << 1) | bit; \
48 #define literal_encode_matched(subcoder, match_byte, symbol) \
50 uint32_t context = 1; \
54 uint32_t bit = ((symbol) >> i) & 1; \
55 const uint32_t match_bit = ((match_byte) >> i) & 1; \
56 const uint32_t subcoder_index = 0x100 + (match_bit << 8) + context; \
57 bit_encode(subcoder[subcoder_index], bit); \
58 context = (context << 1) | bit; \
59 if (match_bit != bit) { \
62 bit = ((symbol) >> i) & 1; \
63 bit_encode(subcoder[context], bit); \
64 context = (context << 1) | bit; \
72 #define length_encode(length_encoder, symbol, pos_state, update_price) \
75 if ((symbol) < LEN_LOW_SYMBOLS) { \
76 bit_encode_0((length_encoder).choice); \
77 bittree_encode((length_encoder).low[pos_state], \
78 LEN_LOW_BITS, symbol); \
80 bit_encode_1((length_encoder).choice); \
81 if ((symbol) < LEN_LOW_SYMBOLS + LEN_MID_SYMBOLS) { \
82 bit_encode_0((length_encoder).choice2); \
83 bittree_encode((length_encoder).mid[pos_state], \
85 (symbol) - LEN_LOW_SYMBOLS); \
87 bit_encode_1((length_encoder).choice2); \
88 bittree_encode((length_encoder).high, LEN_HIGH_BITS, \
89 (symbol) - LEN_LOW_SYMBOLS \
94 if (--(length_encoder).counters[pos_state] == 0) \
95 lzma_length_encoder_update_table(&(length_encoder), pos_state); \
103 /// \brief Updates price table of the length encoder
105 /// Like all the other prices in LZMA, these are used by lzma_get_optimum().
108 lzma_length_encoder_update_table(lzma_length_encoder *lencoder,
109 const uint32_t pos_state)
111 const uint32_t num_symbols = lencoder->table_size;
112 const uint32_t a0 = bit_get_price_0(lencoder->choice);
113 const uint32_t a1 = bit_get_price_1(lencoder->choice);
114 const uint32_t b0 = a1 + bit_get_price_0(lencoder->choice2);
115 const uint32_t b1 = a1 + bit_get_price_1(lencoder->choice2);
117 uint32_t *prices = lencoder->prices[pos_state];
120 for (i = 0; i < num_symbols && i < LEN_LOW_SYMBOLS; ++i)
121 prices[i] = a0 + bittree_get_price(lencoder->low[pos_state],
124 for (; i < num_symbols && i < LEN_LOW_SYMBOLS + LEN_MID_SYMBOLS; ++i)
125 prices[i] = b0 + bittree_get_price(lencoder->mid[pos_state],
126 LEN_MID_BITS, i - LEN_LOW_SYMBOLS);
128 for (; i < num_symbols; ++i)
129 prices[i] = b1 + bittree_get_price(
130 lencoder->high, LEN_HIGH_BITS,
131 i - LEN_LOW_SYMBOLS - LEN_MID_SYMBOLS);
133 lencoder->counters[pos_state] = num_symbols;
140 * \brief LZMA encoder
142 * \return true if end of stream was reached, false otherwise.
145 lzma_lzma_encode(lzma_coder *coder, uint8_t *restrict out,
146 size_t *restrict out_pos, size_t out_size)
148 #define rc_buffer coder->lz.temp
149 #define rc_buffer_size coder->lz.temp_size
152 lzma_range_encoder rc = coder->rc;
153 size_t out_pos_local = *out_pos;
154 const uint32_t pos_mask = coder->pos_mask;
155 const bool best_compression = coder->best_compression;
157 // Initialize the stream if no data has been encoded yet.
158 if (!coder->is_initialized) {
159 if (coder->lz.read_pos == coder->lz.read_limit) {
160 if (coder->lz.sequence == SEQ_RUN)
161 return false; // We cannot do anything.
163 // We are finishing (we cannot get here when flushing).
164 assert(coder->lz.write_pos == coder->lz.read_pos);
165 assert(coder->lz.sequence == SEQ_FINISH);
167 // Do the actual initialization.
169 uint32_t num_distance_pairs;
170 lzma_read_match_distances(coder, &len, &num_distance_pairs);
172 bit_encode_0(coder->is_match[coder->state][0]);
173 update_char(coder->state);
175 const uint8_t cur_byte = coder->lz.buffer[
176 coder->lz.read_pos - coder->additional_offset];
177 probability *subcoder = literal_get_subcoder(coder->literal_coder,
178 coder->now_pos, coder->previous_byte);
179 literal_encode(subcoder, cur_byte);
181 coder->previous_byte = cur_byte;
182 --coder->additional_offset;
185 assert(coder->additional_offset == 0);
188 // Initialization is done (except if empty file).
189 coder->is_initialized = true;
194 // Check that there is free output space.
195 if (out_pos_local == out_size)
198 assert(rc_buffer_size == 0);
200 // Check that there is some input to process.
201 if (coder->lz.read_pos >= coder->lz.read_limit) {
202 // If flushing or finishing, we must keep encoding
203 // until additional_offset becomes zero to make
204 // all the input available at output.
205 if (coder->lz.sequence == SEQ_RUN
206 || coder->additional_offset == 0)
210 assert(coder->lz.read_pos <= coder->lz.write_pos);
213 if (coder->lz.sequence != SEQ_RUN) {
214 assert(coder->lz.read_limit == coder->lz.write_pos);
216 assert(coder->lz.read_limit + coder->lz.keep_size_after
217 == coder->lz.write_pos);
221 const uint32_t pos_state = coder->now_pos & pos_mask;
226 // Get optimal match (repeat position and length).
227 // Value ranges for pos:
228 // - [0, REP_DISTANCES): repeated match
229 // - [REP_DISTANCES, UINT32_MAX): match at (pos - REP_DISTANCES)
230 // - UINT32_MAX: not a match but a literal
231 // Value ranges for len:
232 // - [MATCH_MIN_LEN, MATCH_MAX_LEN]
233 if (best_compression)
234 lzma_get_optimum(coder, &pos, &len);
236 lzma_get_optimum_fast(coder, &pos, &len);
238 if (len == 1 && pos == UINT32_MAX) {
240 bit_encode_0(coder->is_match[coder->state][pos_state]);
242 const uint8_t cur_byte = coder->lz.buffer[
243 coder->lz.read_pos - coder->additional_offset];
244 probability *subcoder = literal_get_subcoder(coder->literal_coder,
245 coder->now_pos, coder->previous_byte);
247 if (is_char_state(coder->state)) {
248 literal_encode(subcoder, cur_byte);
250 const uint8_t match_byte = coder->lz.buffer[
252 - coder->rep_distances[0] - 1
253 - coder->additional_offset];
254 literal_encode_matched(subcoder, match_byte, cur_byte);
257 update_char(coder->state);
258 coder->previous_byte = cur_byte;
262 bit_encode_1(coder->is_match[coder->state][pos_state]);
264 if (pos < REP_DISTANCES) {
265 // It's a repeated match i.e. the same distance
266 // has been used earlier.
267 bit_encode_1(coder->is_rep[coder->state]);
270 bit_encode_0(coder->is_rep0[coder->state]);
271 const uint32_t symbol = (len == 1) ? 0 : 1;
272 bit_encode(coder->is_rep0_long[coder->state][pos_state],
275 const uint32_t distance = coder->rep_distances[pos];
276 bit_encode_1(coder->is_rep0[coder->state]);
279 bit_encode_0(coder->is_rep1[coder->state]);
281 bit_encode_1(coder->is_rep1[coder->state]);
282 bit_encode(coder->is_rep2[coder->state], pos - 2);
285 coder->rep_distances[3] = coder->rep_distances[2];
287 coder->rep_distances[2] = coder->rep_distances[1];
290 coder->rep_distances[1] = coder->rep_distances[0];
291 coder->rep_distances[0] = distance;
295 update_short_rep(coder->state);
297 length_encode(coder->rep_match_len_encoder,
298 len - MATCH_MIN_LEN, pos_state,
300 update_rep(coder->state);
304 bit_encode_0(coder->is_rep[coder->state]);
305 update_match(coder->state);
306 length_encode(coder->len_encoder, len - MATCH_MIN_LEN,
307 pos_state, best_compression);
308 pos -= REP_DISTANCES;
310 const uint32_t pos_slot = get_pos_slot(pos);
311 const uint32_t len_to_pos_state = get_len_to_pos_state(len);
312 bittree_encode(coder->pos_slot_encoder[len_to_pos_state],
313 POS_SLOT_BITS, pos_slot);
315 if (pos_slot >= START_POS_MODEL_INDEX) {
316 const uint32_t footer_bits = (pos_slot >> 1) - 1;
317 const uint32_t base = (2 | (pos_slot & 1)) << footer_bits;
318 const uint32_t pos_reduced = pos - base;
320 if (pos_slot < END_POS_MODEL_INDEX) {
321 bittree_reverse_encode(
322 coder->pos_encoders + base - pos_slot - 1,
323 footer_bits, pos_reduced);
325 rc_encode_direct_bits(pos_reduced >> ALIGN_BITS,
326 footer_bits - ALIGN_BITS);
327 bittree_reverse_encode(coder->pos_align_encoder,
328 ALIGN_BITS, pos_reduced & ALIGN_MASK);
329 ++coder->align_price_count;
333 coder->rep_distances[3] = coder->rep_distances[2];
334 coder->rep_distances[2] = coder->rep_distances[1];
335 coder->rep_distances[1] = coder->rep_distances[0];
336 coder->rep_distances[0] = pos;
337 ++coder->match_price_count;
340 coder->previous_byte = coder->lz.buffer[
341 coder->lz.read_pos + len - 1
342 - coder->additional_offset];
345 assert(coder->additional_offset >= len);
346 coder->additional_offset -= len;
347 coder->now_pos += len;
350 // Check if everything is done.
351 bool all_done = false;
352 if (coder->lz.sequence != SEQ_RUN
353 && coder->lz.read_pos == coder->lz.write_pos
354 && coder->additional_offset == 0) {
355 if (coder->lz.uncompressed_size == LZMA_VLI_VALUE_UNKNOWN
356 || coder->lz.sequence == SEQ_FLUSH) {
357 // Write special marker: flush marker or end of payload
358 // marker. Both are encoded as a match with distance of
359 // UINT32_MAX. The match length codes the type of the marker.
360 const uint32_t pos_state = coder->now_pos & pos_mask;
361 bit_encode_1(coder->is_match[coder->state][pos_state]);
362 bit_encode_0(coder->is_rep[coder->state]);
363 update_match(coder->state);
365 const uint32_t len = coder->lz.sequence == SEQ_FLUSH
366 ? LEN_SPECIAL_FLUSH : LEN_SPECIAL_EOPM;
367 length_encode(coder->len_encoder, len - MATCH_MIN_LEN,
368 pos_state, best_compression);
370 const uint32_t pos_slot = (1 << POS_SLOT_BITS) - 1;
371 const uint32_t len_to_pos_state = get_len_to_pos_state(len);
372 bittree_encode(coder->pos_slot_encoder[len_to_pos_state],
373 POS_SLOT_BITS, pos_slot);
375 const uint32_t footer_bits = 30;
376 const uint32_t pos_reduced
377 = (UINT32_C(1) << footer_bits) - 1;
378 rc_encode_direct_bits(pos_reduced >> ALIGN_BITS,
379 footer_bits - ALIGN_BITS);
381 bittree_reverse_encode(coder->pos_align_encoder, ALIGN_BITS,
382 pos_reduced & ALIGN_MASK);
385 // Flush the last bytes of compressed data from
386 // the range coder to the output buffer.
391 // All done. Note that some output bytes might be
392 // pending in coder->lz.temp. lzma_lz_encode() will
393 // take care of those bytes.
397 // Store local variables back to *coder.
399 *out_pos = out_pos_local;