1 ///////////////////////////////////////////////////////////////////////////////
3 /// \file lzma_encoder.c
4 /// \brief LZMA encoder
6 // Copyright (C) 1999-2006 Igor Pavlov
7 // Copyright (C) 2007 Lasse Collin
9 // This library is free software; you can redistribute it and/or
10 // modify it under the terms of the GNU Lesser General Public
11 // License as published by the Free Software Foundation; either
12 // version 2.1 of the License, or (at your option) any later version.
14 // This library is distributed in the hope that it will be useful,
15 // but WITHOUT ANY WARRANTY; without even the implied warranty of
16 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 // Lesser General Public License for more details.
19 ///////////////////////////////////////////////////////////////////////////////
21 #include "lzma_encoder_private.h"
30 literal_normal(lzma_range_encoder *rc, probability *subcoder, uint32_t symbol)
33 uint32_t bit_count = 8; // Bits per byte
35 const uint32_t bit = (symbol >> --bit_count) & 1;
36 rc_bit(rc, &subcoder[context], bit);
37 context = (context << 1) | bit;
38 } while (bit_count != 0);
43 literal_matched(lzma_range_encoder *rc, probability *subcoder,
44 uint32_t match_byte, uint32_t symbol)
47 uint32_t bit_count = 8;
50 uint32_t bit = (symbol >> --bit_count) & 1;
51 const uint32_t match_bit = (match_byte >> bit_count) & 1;
52 rc_bit(rc, &subcoder[(0x100 << match_bit) + context], bit);
53 context = (context << 1) | bit;
55 if (match_bit != bit) {
56 // The bit from the literal being encoded and the bit
57 // from the previous match differ. Finish encoding
58 // as a normal literal.
59 while (bit_count != 0) {
60 bit = (symbol >> --bit_count) & 1;
61 rc_bit(rc, &subcoder[context], bit);
62 context = (context << 1) | bit;
68 } while (bit_count != 0);
73 literal(lzma_coder *coder)
75 // Locate the literal byte to be encoded and the subcoder.
76 const uint8_t cur_byte = coder->lz.buffer[
77 coder->lz.read_pos - coder->additional_offset];
78 probability *subcoder = literal_get_subcoder(coder->literal_coder,
79 coder->now_pos, coder->previous_byte);
81 if (is_literal_state(coder->state)) {
82 // Previous LZMA-symbol was a literal. Encode a normal
83 // literal without a match byte.
84 literal_normal(&coder->rc, subcoder, cur_byte);
86 // Previous LZMA-symbol was a match. Use the last byte of
87 // the match as a "match byte". That is, compare the bits
88 // of the current literal and the match byte.
89 const uint8_t match_byte = coder->lz.buffer[
90 coder->lz.read_pos - coder->reps[0] - 1
91 - coder->additional_offset];
92 literal_matched(&coder->rc, subcoder, match_byte, cur_byte);
95 update_literal(coder->state);
96 coder->previous_byte = cur_byte;
105 length(lzma_range_encoder *rc, lzma_length_encoder *lc,
106 const uint32_t pos_state, uint32_t len)
108 assert(len <= MATCH_MAX_LEN);
109 len -= MATCH_MIN_LEN;
111 if (len < LEN_LOW_SYMBOLS) {
112 rc_bit(rc, &lc->choice, 0);
113 rc_bittree(rc, lc->low[pos_state], LEN_LOW_BITS, len);
115 rc_bit(rc, &lc->choice, 1);
116 len -= LEN_LOW_SYMBOLS;
118 if (len < LEN_MID_SYMBOLS) {
119 rc_bit(rc, &lc->choice2, 0);
120 rc_bittree(rc, lc->mid[pos_state], LEN_MID_BITS, len);
122 rc_bit(rc, &lc->choice2, 1);
123 len -= LEN_MID_SYMBOLS;
124 rc_bittree(rc, lc->high, LEN_HIGH_BITS, len);
135 match(lzma_coder *coder, const uint32_t pos_state,
136 const uint32_t distance, const uint32_t len)
138 update_match(coder->state);
140 length(&coder->rc, &coder->match_len_encoder, pos_state, len);
141 coder->prev_len_encoder = &coder->match_len_encoder;
143 const uint32_t pos_slot = get_pos_slot(distance);
144 const uint32_t len_to_pos_state = get_len_to_pos_state(len);
145 rc_bittree(&coder->rc, coder->pos_slot_encoder[len_to_pos_state],
146 POS_SLOT_BITS, pos_slot);
148 if (pos_slot >= START_POS_MODEL_INDEX) {
149 const uint32_t footer_bits = (pos_slot >> 1) - 1;
150 const uint32_t base = (2 | (pos_slot & 1)) << footer_bits;
151 const uint32_t pos_reduced = distance - base;
153 if (pos_slot < END_POS_MODEL_INDEX) {
154 rc_bittree_reverse(&coder->rc,
155 &coder->pos_encoders[base - pos_slot - 1],
156 footer_bits, pos_reduced);
158 rc_direct(&coder->rc, pos_reduced >> ALIGN_BITS,
159 footer_bits - ALIGN_BITS);
161 &coder->rc, coder->pos_align_encoder,
162 ALIGN_BITS, pos_reduced & ALIGN_MASK);
163 ++coder->align_price_count;
167 coder->reps[3] = coder->reps[2];
168 coder->reps[2] = coder->reps[1];
169 coder->reps[1] = coder->reps[0];
170 coder->reps[0] = distance;
171 ++coder->match_price_count;
180 rep_match(lzma_coder *coder, const uint32_t pos_state,
181 const uint32_t rep, const uint32_t len)
184 rc_bit(&coder->rc, &coder->is_rep0[coder->state], 0);
186 &coder->is_rep0_long[coder->state][pos_state],
189 const uint32_t distance = coder->reps[rep];
190 rc_bit(&coder->rc, &coder->is_rep0[coder->state], 1);
193 rc_bit(&coder->rc, &coder->is_rep1[coder->state], 0);
195 rc_bit(&coder->rc, &coder->is_rep1[coder->state], 1);
196 rc_bit(&coder->rc, &coder->is_rep2[coder->state],
200 coder->reps[3] = coder->reps[2];
202 coder->reps[2] = coder->reps[1];
205 coder->reps[1] = coder->reps[0];
206 coder->reps[0] = distance;
210 update_short_rep(coder->state);
212 length(&coder->rc, &coder->rep_len_encoder, pos_state, len);
213 coder->prev_len_encoder = &coder->rep_len_encoder;
214 update_long_rep(coder->state);
224 encode_symbol(lzma_coder *coder, uint32_t pos, uint32_t len)
226 const uint32_t pos_state = coder->now_pos & coder->pos_mask;
228 if (len == 1 && pos == UINT32_MAX) {
229 // Literal i.e. eight-bit byte
231 &coder->is_match[coder->state][pos_state], 0);
234 // Some type of match
236 &coder->is_match[coder->state][pos_state], 1);
238 if (pos < REP_DISTANCES) {
239 // It's a repeated match i.e. the same distance
240 // has been used earlier.
241 rc_bit(&coder->rc, &coder->is_rep[coder->state], 1);
242 rep_match(coder, pos_state, pos, len);
245 rc_bit(&coder->rc, &coder->is_rep[coder->state], 0);
246 match(coder, pos_state, pos - REP_DISTANCES, len);
249 coder->previous_byte = coder->lz.buffer[
250 coder->lz.read_pos + len - 1
251 - coder->additional_offset];
254 assert(coder->additional_offset >= len);
255 coder->additional_offset -= len;
256 coder->now_pos += len;
261 encode_eopm(lzma_coder *coder)
263 const uint32_t pos_state = coder->now_pos & coder->pos_mask;
264 rc_bit(&coder->rc, &coder->is_match[coder->state][pos_state], 1);
265 rc_bit(&coder->rc, &coder->is_rep[coder->state], 0);
266 match(coder, pos_state, UINT32_MAX, MATCH_MIN_LEN);
271 * \brief LZMA encoder
273 * \return true if end of stream was reached, false otherwise.
276 lzma_lzma_encode(lzma_coder *coder, uint8_t *restrict out,
277 size_t *restrict out_pos, size_t out_size)
279 // Initialize the stream if no data has been encoded yet.
280 if (!coder->is_initialized) {
281 if (coder->lz.read_pos == coder->lz.read_limit) {
282 if (coder->lz.sequence == SEQ_RUN)
283 return false; // We cannot do anything.
285 // We are finishing (we cannot get here when flushing).
286 assert(coder->lz.write_pos == coder->lz.read_pos);
287 assert(coder->lz.sequence == SEQ_FINISH);
289 // Do the actual initialization.
291 uint32_t num_distance_pairs;
292 lzma_read_match_distances(coder,
293 &len, &num_distance_pairs);
295 encode_symbol(coder, UINT32_MAX, 1);
297 assert(coder->additional_offset == 0);
300 // Initialization is done (except if empty file).
301 coder->is_initialized = true;
306 // Encode pending bits, if any.
307 if (rc_encode(&coder->rc, out, out_pos, out_size))
310 // Check that there is some input to process.
311 if (coder->lz.read_pos >= coder->lz.read_limit) {
312 // If flushing or finishing, we must keep encoding
313 // until additional_offset becomes zero to make
314 // all the input available at output.
315 if (coder->lz.sequence == SEQ_RUN)
318 if (coder->additional_offset == 0)
322 assert(coder->lz.read_pos <= coder->lz.write_pos);
325 if (coder->lz.sequence != SEQ_RUN) {
326 assert(coder->lz.read_limit == coder->lz.write_pos);
328 assert(coder->lz.read_limit + coder->lz.keep_size_after
329 == coder->lz.write_pos);
336 // Get optimal match (repeat position and length).
337 // Value ranges for pos:
338 // - [0, REP_DISTANCES): repeated match
339 // - [REP_DISTANCES, UINT32_MAX):
340 // match at (pos - REP_DISTANCES)
341 // - UINT32_MAX: not a match but a literal
342 // Value ranges for len:
343 // - [MATCH_MIN_LEN, MATCH_MAX_LEN]
344 if (coder->best_compression)
345 lzma_get_optimum(coder, &pos, &len);
347 lzma_get_optimum_fast(coder, &pos, &len);
349 encode_symbol(coder, pos, len);
352 assert(!coder->longest_match_was_found);
354 if (coder->is_flushed) {
355 coder->is_flushed = false;
359 // We don't support encoding old LZMA streams without EOPM, and LZMA2
360 // doesn't use EOPM at LZMA level.
361 if (coder->write_eopm)
364 rc_flush(&coder->rc);
366 if (rc_encode(&coder->rc, out, out_pos, out_size)) {
367 coder->is_flushed = true;