]> icculus.org git repositories - icculus/xz.git/blob - src/liblzma/lzma/lzma_encoder.c
Fix a typo in lzma_encoder.c.
[icculus/xz.git] / src / liblzma / lzma / lzma_encoder.c
1 ///////////////////////////////////////////////////////////////////////////////
2 //
3 /// \file       lzma_encoder.c
4 /// \brief      LZMA encoder
5 //
6 //  Copyright (C) 1999-2006 Igor Pavlov
7 //  Copyright (C) 2007 Lasse Collin
8 //
9 //  This library is free software; you can redistribute it and/or
10 //  modify it under the terms of the GNU Lesser General Public
11 //  License as published by the Free Software Foundation; either
12 //  version 2.1 of the License, or (at your option) any later version.
13 //
14 //  This library is distributed in the hope that it will be useful,
15 //  but WITHOUT ANY WARRANTY; without even the implied warranty of
16 //  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
17 //  Lesser General Public License for more details.
18 //
19 ///////////////////////////////////////////////////////////////////////////////
20
21 // NOTE: If you want to keep the line length in 80 characters, set
22 //       tab width to 4 or less in your editor when editing this file.
23
24
25 #include "lzma_encoder_private.h"
26
27
28 ////////////
29 // Macros //
30 ////////////
31
32 // These are as macros mostly because they use local range encoder variables.
33
34 #define literal_encode(subcoder, symbol) \
35 do { \
36         uint32_t context = 1; \
37         int i = 8; \
38         do { \
39                 --i; \
40                 const uint32_t bit = ((symbol) >> i) & 1; \
41                 bit_encode(subcoder[context], bit); \
42                 context = (context << 1) | bit; \
43         } while (i != 0); \
44 } while (0)
45
46
47 #define literal_encode_matched(subcoder, match_byte, symbol) \
48 do { \
49         uint32_t context = 1; \
50         int i = 8; \
51         do { \
52                 --i; \
53                 uint32_t bit = ((symbol) >> i) & 1; \
54                 const uint32_t match_bit = ((match_byte) >> i) & 1; \
55                 const uint32_t subcoder_index = 0x100 + (match_bit << 8) + context; \
56                 bit_encode(subcoder[subcoder_index], bit); \
57                 context = (context << 1) | bit; \
58                 if (match_bit != bit) { \
59                         while (i != 0) { \
60                                 --i; \
61                                 bit = ((symbol) >> i) & 1; \
62                                 bit_encode(subcoder[context], bit); \
63                                 context = (context << 1) | bit; \
64                         } \
65                         break; \
66                 } \
67         } while (i != 0); \
68 } while (0)
69
70
71 #define length_encode(length_encoder, symbol, pos_state, update_price) \
72 do { \
73         \
74         if ((symbol) < LEN_LOW_SYMBOLS) { \
75                 bit_encode_0((length_encoder).choice); \
76                 bittree_encode((length_encoder).low[pos_state], \
77                                 LEN_LOW_BITS, symbol); \
78         } else { \
79                 bit_encode_1((length_encoder).choice); \
80                 if ((symbol) < LEN_LOW_SYMBOLS + LEN_MID_SYMBOLS) { \
81                         bit_encode_0((length_encoder).choice2); \
82                         bittree_encode((length_encoder).mid[pos_state], \
83                                         LEN_MID_BITS, \
84                                         (symbol) - LEN_LOW_SYMBOLS); \
85                 } else { \
86                         bit_encode_1((length_encoder).choice2); \
87                         bittree_encode((length_encoder).high, LEN_HIGH_BITS, \
88                                         (symbol) - LEN_LOW_SYMBOLS \
89                                         - LEN_MID_SYMBOLS); \
90                 } \
91         } \
92         if (update_price) \
93                 if (--(length_encoder).counters[pos_state] == 0) \
94                         lzma_length_encoder_update_table(&(length_encoder), pos_state); \
95 } while (0)
96
97
98 ///////////////
99 // Functions //
100 ///////////////
101
102 /// \brief      Updates price table of the length encoder
103 ///
104 /// Like all the other prices in LZMA, these are used by lzma_get_optimum().
105 ///
106 extern void
107 lzma_length_encoder_update_table(lzma_length_encoder *lencoder,
108                 const uint32_t pos_state)
109 {
110         const uint32_t num_symbols = lencoder->table_size;
111         const uint32_t a0 = bit_get_price_0(lencoder->choice);
112         const uint32_t a1 = bit_get_price_1(lencoder->choice);
113         const uint32_t b0 = a1 + bit_get_price_0(lencoder->choice2);
114         const uint32_t b1 = a1 + bit_get_price_1(lencoder->choice2);
115
116         uint32_t *prices = lencoder->prices[pos_state];
117         uint32_t i = 0;
118
119         for (i = 0; i < num_symbols && i < LEN_LOW_SYMBOLS; ++i)
120                 prices[i] = a0 + bittree_get_price(lencoder->low[pos_state],
121                                 LEN_LOW_BITS, i);
122
123         for (; i < num_symbols && i < LEN_LOW_SYMBOLS + LEN_MID_SYMBOLS; ++i)
124                 prices[i] = b0 + bittree_get_price(lencoder->mid[pos_state],
125                                 LEN_MID_BITS, i - LEN_LOW_SYMBOLS);
126
127         for (; i < num_symbols; ++i)
128                 prices[i] = b1 + bittree_get_price(
129                                 lencoder->high, LEN_HIGH_BITS,
130                                 i - LEN_LOW_SYMBOLS - LEN_MID_SYMBOLS);
131
132         lencoder->counters[pos_state] = num_symbols;
133
134         return;
135 }
136
137
138 /**
139  * \brief       LZMA encoder
140  *
141  * \return      true if end of stream was reached, false otherwise.
142  */
143 extern bool
144 lzma_lzma_encode(lzma_coder *coder, uint8_t *restrict out,
145                 size_t *restrict out_pos, size_t out_size)
146 {
147 #define rc_buffer coder->lz.temp
148 #define rc_buffer_size coder->lz.temp_size
149
150         // Local copies
151         lzma_range_encoder rc = coder->rc;
152         size_t out_pos_local = *out_pos;
153         const uint32_t pos_mask = coder->pos_mask;
154         const bool best_compression = coder->best_compression;
155
156         // Initialize the stream if no data has been encoded yet.
157         if (!coder->is_initialized) {
158                 if (coder->lz.read_pos == coder->lz.read_limit) {
159                         switch (coder->lz.sequence) {
160                         case SEQ_RUN:
161                                 // Cannot initialize, because there is
162                                 // no input data.
163                                 return false;
164
165                         case SEQ_FLUSH:
166                                 // Nothing to flush. There cannot be a flush
167                                 // marker when no data has been processed
168                                 // yet (file format doesn't allow it, and
169                                 // it would be just waste of space).
170                                 return true;
171
172                         case SEQ_FINISH:
173                                 // We are encoding an empty file. No need
174                                 // to initialize the encoder.
175                                 assert(coder->lz.write_pos == coder->lz.read_pos);
176                                 break;
177
178                         default:
179                                 // We never get here.
180                                 assert(0);
181                                 return true;
182                         }
183
184                 } else {
185                         // Do the actual initialization.
186                         uint32_t len;
187                         uint32_t num_distance_pairs;
188                         lzma_read_match_distances(coder, &len, &num_distance_pairs);
189
190                         bit_encode_0(coder->is_match[coder->state][0]);
191                         update_char(coder->state);
192
193                         const uint8_t cur_byte = coder->lz.buffer[
194                                         coder->lz.read_pos - coder->additional_offset];
195                         probability *subcoder = literal_get_subcoder(coder->literal_coder,
196                                         coder->now_pos, coder->previous_byte);
197                         literal_encode(subcoder, cur_byte);
198
199                         coder->previous_byte = cur_byte;
200                         --coder->additional_offset;
201                         ++coder->now_pos;
202
203                         assert(coder->additional_offset == 0);
204                 }
205
206                 // Initialization is done (except if empty file).
207                 coder->is_initialized = true;
208         }
209
210         // Encoding loop
211         while (true) {
212                 // Check that there is free output space.
213                 if (out_pos_local == out_size)
214                         break;
215
216                 assert(rc_buffer_size == 0);
217
218                 // Check that there is some input to process.
219                 if (coder->lz.read_pos >= coder->lz.read_limit) {
220                         // If flushing or finishing, we must keep encoding
221                         // until additional_offset becomes zero to make
222                         // all the input available at output.
223                         if (coder->lz.sequence == SEQ_RUN
224                                         || coder->additional_offset == 0)
225                                 break;
226                 }
227
228                 assert(coder->lz.read_pos <= coder->lz.write_pos);
229
230 #ifndef NDEBUG
231                 if (coder->lz.sequence != SEQ_RUN) {
232                         assert(coder->lz.read_limit == coder->lz.write_pos);
233                 } else {
234                         assert(coder->lz.read_limit + coder->lz.keep_size_after
235                                         == coder->lz.write_pos);
236                 }
237 #endif
238
239                 const uint32_t pos_state = coder->now_pos & pos_mask;
240
241                 uint32_t pos;
242                 uint32_t len;
243
244                 // Get optimal match (repeat position and length).
245                 // Value ranges for pos:
246                 //   - [0, REP_DISTANCES): repeated match
247                 //   - [REP_DISTANCES, UINT32_MAX): match at (pos - REP_DISTANCES)
248                 //   - UINT32_MAX: not a match but a literal
249                 // Value ranges for len:
250                 //   - [MATCH_MIN_LEN, MATCH_MAX_LEN]
251                 if (best_compression)
252                         lzma_get_optimum(coder, &pos, &len);
253                 else
254                         lzma_get_optimum_fast(coder, &pos, &len);
255
256                 if (len == 1 && pos == UINT32_MAX) {
257                         // It's a literal.
258                         bit_encode_0(coder->is_match[coder->state][pos_state]);
259
260                         const uint8_t cur_byte = coder->lz.buffer[
261                                         coder->lz.read_pos - coder->additional_offset];
262                         probability *subcoder = literal_get_subcoder(coder->literal_coder,
263                                         coder->now_pos, coder->previous_byte);
264
265                         if (is_char_state(coder->state)) {
266                                 literal_encode(subcoder, cur_byte);
267                         } else {
268                                 const uint8_t match_byte = coder->lz.buffer[
269                                                 coder->lz.read_pos
270                                                 - coder->rep_distances[0] - 1
271                                                 - coder->additional_offset];
272                                 literal_encode_matched(subcoder, match_byte, cur_byte);
273                         }
274
275                         update_char(coder->state);
276                         coder->previous_byte = cur_byte;
277
278                 } else {
279                         // It's a match.
280                         bit_encode_1(coder->is_match[coder->state][pos_state]);
281
282                         if (pos < REP_DISTANCES) {
283                                 // It's a repeated match i.e. the same distance
284                                 // has been used earlier.
285                                 bit_encode_1(coder->is_rep[coder->state]);
286
287                                 if (pos == 0) {
288                                         bit_encode_0(coder->is_rep0[coder->state]);
289                                         const uint32_t symbol = (len == 1) ? 0 : 1;
290                                         bit_encode(coder->is_rep0_long[coder->state][pos_state],
291                                                         symbol);
292                                 } else {
293                                         const uint32_t distance = coder->rep_distances[pos];
294                                         bit_encode_1(coder->is_rep0[coder->state]);
295
296                                         if (pos == 1) {
297                                                 bit_encode_0(coder->is_rep1[coder->state]);
298                                         } else {
299                                                 bit_encode_1(coder->is_rep1[coder->state]);
300                                                 bit_encode(coder->is_rep2[coder->state], pos - 2);
301
302                                                 if (pos == 3)
303                                                         coder->rep_distances[3] = coder->rep_distances[2];
304
305                                                 coder->rep_distances[2] = coder->rep_distances[1];
306                                         }
307
308                                         coder->rep_distances[1] = coder->rep_distances[0];
309                                         coder->rep_distances[0] = distance;
310                                 }
311
312                                 if (len == 1) {
313                                         update_short_rep(coder->state);
314                                 } else {
315                                         length_encode(coder->rep_match_len_encoder,
316                                                         len - MATCH_MIN_LEN, pos_state,
317                                                         best_compression);
318                                         update_rep(coder->state);
319                                 }
320
321                         } else {
322                                 bit_encode_0(coder->is_rep[coder->state]);
323                                 update_match(coder->state);
324                                 length_encode(coder->len_encoder, len - MATCH_MIN_LEN,
325                                                 pos_state, best_compression);
326                                 pos -= REP_DISTANCES;
327
328                                 const uint32_t pos_slot = get_pos_slot(pos);
329                                 const uint32_t len_to_pos_state = get_len_to_pos_state(len);
330                                 bittree_encode(coder->pos_slot_encoder[len_to_pos_state],
331                                                 POS_SLOT_BITS, pos_slot);
332
333                                 if (pos_slot >= START_POS_MODEL_INDEX) {
334                                         const uint32_t footer_bits = (pos_slot >> 1) - 1;
335                                         const uint32_t base = (2 | (pos_slot & 1)) << footer_bits;
336                                         const uint32_t pos_reduced = pos - base;
337
338                                         if (pos_slot < END_POS_MODEL_INDEX) {
339                                                 bittree_reverse_encode(
340                                                                 coder->pos_encoders + base - pos_slot - 1,
341                                                                 footer_bits, pos_reduced);
342                                         } else {
343                                                 rc_encode_direct_bits(pos_reduced >> ALIGN_BITS,
344                                                                 footer_bits - ALIGN_BITS);
345                                                 bittree_reverse_encode(coder->pos_align_encoder,
346                                                                 ALIGN_BITS, pos_reduced & ALIGN_MASK);
347                                                 ++coder->align_price_count;
348                                         }
349                                 }
350
351                                 coder->rep_distances[3] = coder->rep_distances[2];
352                                 coder->rep_distances[2] = coder->rep_distances[1];
353                                 coder->rep_distances[1] = coder->rep_distances[0];
354                                 coder->rep_distances[0] = pos;
355                                 ++coder->match_price_count;
356                         }
357
358                         coder->previous_byte = coder->lz.buffer[
359                                         coder->lz.read_pos + len - 1
360                                         - coder->additional_offset];
361                 }
362
363                 assert(coder->additional_offset >= len);
364                 coder->additional_offset -= len;
365                 coder->now_pos += len;
366         }
367
368         // Check if everything is done.
369         bool all_done = false;
370         if (coder->lz.sequence != SEQ_RUN
371                         && coder->lz.read_pos == coder->lz.write_pos
372                         && coder->additional_offset == 0) {
373                 if (coder->lz.uncompressed_size == LZMA_VLI_VALUE_UNKNOWN
374                                 || coder->lz.sequence == SEQ_FLUSH) {
375                         // Write special marker: flush marker or end of payload
376                         // marker. Both are encoded as a match with distance of
377                         // UINT32_MAX. The match length codes the type of the marker.
378                         const uint32_t pos_state = coder->now_pos & pos_mask;
379                         bit_encode_1(coder->is_match[coder->state][pos_state]);
380                         bit_encode_0(coder->is_rep[coder->state]);
381                         update_match(coder->state);
382
383                         const uint32_t len = coder->lz.sequence == SEQ_FLUSH
384                                         ? LEN_SPECIAL_FLUSH : LEN_SPECIAL_EOPM;
385                         length_encode(coder->len_encoder, len - MATCH_MIN_LEN,
386                                         pos_state, best_compression);
387
388                         const uint32_t pos_slot = (1 << POS_SLOT_BITS) - 1;
389                         const uint32_t len_to_pos_state = get_len_to_pos_state(len);
390                         bittree_encode(coder->pos_slot_encoder[len_to_pos_state],
391                                         POS_SLOT_BITS, pos_slot);
392
393                         const uint32_t footer_bits = 30;
394                         const uint32_t pos_reduced
395                                         = (UINT32_C(1) << footer_bits) - 1;
396                         rc_encode_direct_bits(pos_reduced >> ALIGN_BITS,
397                                         footer_bits - ALIGN_BITS);
398
399                         bittree_reverse_encode(coder->pos_align_encoder, ALIGN_BITS,
400                                         pos_reduced & ALIGN_MASK);
401                 }
402
403                 // Flush the last bytes of compressed data from
404                 // the range coder to the output buffer.
405                 rc_flush();
406
407                 rc_reset(rc);
408
409                 // All done. Note that some output bytes might be
410                 // pending in coder->lz.temp. lzma_lz_encode() will
411                 // take care of those bytes.
412                 all_done = true;
413         }
414
415         // Store local variables back to *coder.
416         coder->rc = rc;
417         *out_pos = out_pos_local;
418
419         return all_done;
420 }