1 ///////////////////////////////////////////////////////////////////////////////
3 /// \file lzma_encoder.c
4 /// \brief LZMA encoder
6 // Copyright (C) 1999-2006 Igor Pavlov
7 // Copyright (C) 2007 Lasse Collin
9 // This library is free software; you can redistribute it and/or
10 // modify it under the terms of the GNU Lesser General Public
11 // License as published by the Free Software Foundation; either
12 // version 2.1 of the License, or (at your option) any later version.
14 // This library is distributed in the hope that it will be useful,
15 // but WITHOUT ANY WARRANTY; without even the implied warranty of
16 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 // Lesser General Public License for more details.
19 ///////////////////////////////////////////////////////////////////////////////
21 // NOTE: If you want to keep the line length in 80 characters, set
22 // tab width to 4 or less in your editor when editing this file.
25 #include "lzma_encoder_private.h"
32 // These are as macros mostly because they use local range encoder variables.
34 #define literal_encode(subcoder, symbol) \
36 uint32_t context = 1; \
40 const uint32_t bit = ((symbol) >> i) & 1; \
41 bit_encode(subcoder[context], bit); \
42 context = (context << 1) | bit; \
47 #define literal_encode_matched(subcoder, match_byte, symbol) \
49 uint32_t context = 1; \
53 uint32_t bit = ((symbol) >> i) & 1; \
54 const uint32_t match_bit = ((match_byte) >> i) & 1; \
55 const uint32_t subcoder_index = 0x100 + (match_bit << 8) + context; \
56 bit_encode(subcoder[subcoder_index], bit); \
57 context = (context << 1) | bit; \
58 if (match_bit != bit) { \
61 bit = ((symbol) >> i) & 1; \
62 bit_encode(subcoder[context], bit); \
63 context = (context << 1) | bit; \
71 #define length_encode(length_encoder, symbol, pos_state, update_price) \
74 if ((symbol) < LEN_LOW_SYMBOLS) { \
75 bit_encode_0((length_encoder).choice); \
76 bittree_encode((length_encoder).low[pos_state], \
77 LEN_LOW_BITS, symbol); \
79 bit_encode_1((length_encoder).choice); \
80 if ((symbol) < LEN_LOW_SYMBOLS + LEN_MID_SYMBOLS) { \
81 bit_encode_0((length_encoder).choice2); \
82 bittree_encode((length_encoder).mid[pos_state], \
84 (symbol) - LEN_LOW_SYMBOLS); \
86 bit_encode_1((length_encoder).choice2); \
87 bittree_encode((length_encoder).high, LEN_HIGH_BITS, \
88 (symbol) - LEN_LOW_SYMBOLS \
93 if (--(length_encoder).counters[pos_state] == 0) \
94 lzma_length_encoder_update_table(&(length_encoder), pos_state); \
102 /// \brief Updates price table of the length encoder
104 /// Like all the other prices in LZMA, these are used by lzma_get_optimum().
107 lzma_length_encoder_update_table(lzma_length_encoder *lencoder,
108 const uint32_t pos_state)
110 const uint32_t num_symbols = lencoder->table_size;
111 const uint32_t a0 = bit_get_price_0(lencoder->choice);
112 const uint32_t a1 = bit_get_price_1(lencoder->choice);
113 const uint32_t b0 = a1 + bit_get_price_0(lencoder->choice2);
114 const uint32_t b1 = a1 + bit_get_price_1(lencoder->choice2);
116 uint32_t *prices = lencoder->prices[pos_state];
119 for (i = 0; i < num_symbols && i < LEN_LOW_SYMBOLS; ++i)
120 prices[i] = a0 + bittree_get_price(lencoder->low[pos_state],
123 for (; i < num_symbols && i < LEN_LOW_SYMBOLS + LEN_MID_SYMBOLS; ++i)
124 prices[i] = b0 + bittree_get_price(lencoder->mid[pos_state],
125 LEN_MID_BITS, i - LEN_LOW_SYMBOLS);
127 for (; i < num_symbols; ++i)
128 prices[i] = b1 + bittree_get_price(
129 lencoder->high, LEN_HIGH_BITS,
130 i - LEN_LOW_SYMBOLS - LEN_MID_SYMBOLS);
132 lencoder->counters[pos_state] = num_symbols;
139 * \brief LZMA encoder
141 * \return true if end of stream was reached, false otherwise.
144 lzma_lzma_encode(lzma_coder *coder, uint8_t *restrict out,
145 size_t *restrict out_pos, size_t out_size)
147 #define rc_buffer coder->lz.temp
148 #define rc_buffer_size coder->lz.temp_size
151 lzma_range_encoder rc = coder->rc;
152 size_t out_pos_local = *out_pos;
153 const uint32_t pos_mask = coder->pos_mask;
154 const bool best_compression = coder->best_compression;
156 // Initialize the stream if no data has been encoded yet.
157 if (!coder->is_initialized) {
158 if (coder->lz.read_pos == coder->lz.read_limit) {
159 switch (coder->lz.sequence) {
161 // Cannot initialize, because there is
166 // Nothing to flush. There cannot be a flush
167 // marker when no data has been processed
168 // yet (file format doesn't allow it, and
169 // it would be just waste of space).
173 // We are encoding an empty file. No need
174 // to initialize the encoder.
175 assert(coder->lz.write_pos == coder->lz.read_pos);
179 // We never get here.
185 // Do the actual initialization.
187 uint32_t num_distance_pairs;
188 lzma_read_match_distances(coder, &len, &num_distance_pairs);
190 bit_encode_0(coder->is_match[coder->state][0]);
191 update_char(coder->state);
193 const uint8_t cur_byte = coder->lz.buffer[
194 coder->lz.read_pos - coder->additional_offset];
195 probability *subcoder = literal_get_subcoder(coder->literal_coder,
196 coder->now_pos, coder->previous_byte);
197 literal_encode(subcoder, cur_byte);
199 coder->previous_byte = cur_byte;
200 --coder->additional_offset;
203 assert(coder->additional_offset == 0);
206 // Initialization is done (except if empty file).
207 coder->is_initialized = true;
212 // Check that there is free output space.
213 if (out_pos_local == out_size)
216 assert(rc_buffer_size == 0);
218 // Check that there is some input to process.
219 if (coder->lz.read_pos >= coder->lz.read_limit) {
220 // If flushing or finishing, we must keep encoding
221 // until additional_offset becomes zero to make
222 // all the input available at output.
223 if (coder->lz.sequence == SEQ_RUN
224 || coder->additional_offset == 0)
228 assert(coder->lz.read_pos <= coder->lz.write_pos);
231 if (coder->lz.sequence != SEQ_RUN) {
232 assert(coder->lz.read_limit == coder->lz.write_pos);
234 assert(coder->lz.read_limit + coder->lz.keep_size_after
235 == coder->lz.write_pos);
239 const uint32_t pos_state = coder->now_pos & pos_mask;
244 // Get optimal match (repeat position and length).
245 // Value ranges for pos:
246 // - [0, REP_DISTANCES): repeated match
247 // - [REP_DISTANCES, UINT32_MAX): match at (pos - REP_DISTANCES)
248 // - UINT32_MAX: not a match but a literal
249 // Value ranges for len:
250 // - [MATCH_MIN_LEN, MATCH_MAX_LEN]
251 if (best_compression)
252 lzma_get_optimum(coder, &pos, &len);
254 lzma_get_optimum_fast(coder, &pos, &len);
256 if (len == 1 && pos == UINT32_MAX) {
258 bit_encode_0(coder->is_match[coder->state][pos_state]);
260 const uint8_t cur_byte = coder->lz.buffer[
261 coder->lz.read_pos - coder->additional_offset];
262 probability *subcoder = literal_get_subcoder(coder->literal_coder,
263 coder->now_pos, coder->previous_byte);
265 if (is_char_state(coder->state)) {
266 literal_encode(subcoder, cur_byte);
268 const uint8_t match_byte = coder->lz.buffer[
270 - coder->rep_distances[0] - 1
271 - coder->additional_offset];
272 literal_encode_matched(subcoder, match_byte, cur_byte);
275 update_char(coder->state);
276 coder->previous_byte = cur_byte;
280 bit_encode_1(coder->is_match[coder->state][pos_state]);
282 if (pos < REP_DISTANCES) {
283 // It's a repeated match i.e. the same distance
284 // has been used earlier.
285 bit_encode_1(coder->is_rep[coder->state]);
288 bit_encode_0(coder->is_rep0[coder->state]);
289 const uint32_t symbol = (len == 1) ? 0 : 1;
290 bit_encode(coder->is_rep0_long[coder->state][pos_state],
293 const uint32_t distance = coder->rep_distances[pos];
294 bit_encode_1(coder->is_rep0[coder->state]);
297 bit_encode_0(coder->is_rep1[coder->state]);
299 bit_encode_1(coder->is_rep1[coder->state]);
300 bit_encode(coder->is_rep2[coder->state], pos - 2);
303 coder->rep_distances[3] = coder->rep_distances[2];
305 coder->rep_distances[2] = coder->rep_distances[1];
308 coder->rep_distances[1] = coder->rep_distances[0];
309 coder->rep_distances[0] = distance;
313 update_short_rep(coder->state);
315 length_encode(coder->rep_match_len_encoder,
316 len - MATCH_MIN_LEN, pos_state,
318 update_rep(coder->state);
322 bit_encode_0(coder->is_rep[coder->state]);
323 update_match(coder->state);
324 length_encode(coder->len_encoder, len - MATCH_MIN_LEN,
325 pos_state, best_compression);
326 pos -= REP_DISTANCES;
328 const uint32_t pos_slot = get_pos_slot(pos);
329 const uint32_t len_to_pos_state = get_len_to_pos_state(len);
330 bittree_encode(coder->pos_slot_encoder[len_to_pos_state],
331 POS_SLOT_BITS, pos_slot);
333 if (pos_slot >= START_POS_MODEL_INDEX) {
334 const uint32_t footer_bits = (pos_slot >> 1) - 1;
335 const uint32_t base = (2 | (pos_slot & 1)) << footer_bits;
336 const uint32_t pos_reduced = pos - base;
338 if (pos_slot < END_POS_MODEL_INDEX) {
339 bittree_reverse_encode(
340 coder->pos_encoders + base - pos_slot - 1,
341 footer_bits, pos_reduced);
343 rc_encode_direct_bits(pos_reduced >> ALIGN_BITS,
344 footer_bits - ALIGN_BITS);
345 bittree_reverse_encode(coder->pos_align_encoder,
346 ALIGN_BITS, pos_reduced & ALIGN_MASK);
347 ++coder->align_price_count;
351 coder->rep_distances[3] = coder->rep_distances[2];
352 coder->rep_distances[2] = coder->rep_distances[1];
353 coder->rep_distances[1] = coder->rep_distances[0];
354 coder->rep_distances[0] = pos;
355 ++coder->match_price_count;
358 coder->previous_byte = coder->lz.buffer[
359 coder->lz.read_pos + len - 1
360 - coder->additional_offset];
363 assert(coder->additional_offset >= len);
364 coder->additional_offset -= len;
365 coder->now_pos += len;
368 // Check if everything is done.
369 bool all_done = false;
370 if (coder->lz.sequence != SEQ_RUN
371 && coder->lz.read_pos == coder->lz.write_pos
372 && coder->additional_offset == 0) {
373 if (coder->lz.uncompressed_size == LZMA_VLI_VALUE_UNKNOWN
374 || coder->lz.sequence == SEQ_FLUSH) {
375 // Write special marker: flush marker or end of payload
376 // marker. Both are encoded as a match with distance of
377 // UINT32_MAX. The match length codes the type of the marker.
378 const uint32_t pos_state = coder->now_pos & pos_mask;
379 bit_encode_1(coder->is_match[coder->state][pos_state]);
380 bit_encode_0(coder->is_rep[coder->state]);
381 update_match(coder->state);
383 const uint32_t len = coder->lz.sequence == SEQ_FLUSH
384 ? LEN_SPECIAL_FLUSH : LEN_SPECIAL_EOPM;
385 length_encode(coder->len_encoder, len - MATCH_MIN_LEN,
386 pos_state, best_compression);
388 const uint32_t pos_slot = (1 << POS_SLOT_BITS) - 1;
389 const uint32_t len_to_pos_state = get_len_to_pos_state(len);
390 bittree_encode(coder->pos_slot_encoder[len_to_pos_state],
391 POS_SLOT_BITS, pos_slot);
393 const uint32_t footer_bits = 30;
394 const uint32_t pos_reduced
395 = (UINT32_C(1) << footer_bits) - 1;
396 rc_encode_direct_bits(pos_reduced >> ALIGN_BITS,
397 footer_bits - ALIGN_BITS);
399 bittree_reverse_encode(coder->pos_align_encoder, ALIGN_BITS,
400 pos_reduced & ALIGN_MASK);
403 // Flush the last bytes of compressed data from
404 // the range coder to the output buffer.
409 // All done. Note that some output bytes might be
410 // pending in coder->lz.temp. lzma_lz_encode() will
411 // take care of those bytes.
415 // Store local variables back to *coder.
417 *out_pos = out_pos_local;