]> icculus.org git repositories - icculus/xz.git/blob - src/liblzma/lzma/lzma_encoder.c
Comments
[icculus/xz.git] / src / liblzma / lzma / lzma_encoder.c
1 ///////////////////////////////////////////////////////////////////////////////
2 //
3 /// \file       lzma_encoder.c
4 /// \brief      LZMA encoder
5 //
6 //  Copyright (C) 1999-2006 Igor Pavlov
7 //  Copyright (C) 2007 Lasse Collin
8 //
9 //  This library is free software; you can redistribute it and/or
10 //  modify it under the terms of the GNU Lesser General Public
11 //  License as published by the Free Software Foundation; either
12 //  version 2.1 of the License, or (at your option) any later version.
13 //
14 //  This library is distributed in the hope that it will be useful,
15 //  but WITHOUT ANY WARRANTY; without even the implied warranty of
16 //  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
17 //  Lesser General Public License for more details.
18 //
19 ///////////////////////////////////////////////////////////////////////////////
20
21 #include "lzma_encoder_private.h"
22 #include "fastpos.h"
23
24
25 /////////////
26 // Literal //
27 /////////////
28
29 static inline void
30 literal_normal(lzma_range_encoder *rc, probability *subcoder, uint32_t symbol)
31 {
32         uint32_t context = 1;
33         uint32_t bit_count = 8; // Bits per byte
34         do {
35                 const uint32_t bit = (symbol >> --bit_count) & 1;
36                 rc_bit(rc, &subcoder[context], bit);
37                 context = (context << 1) | bit;
38         } while (bit_count != 0);
39 }
40
41
42 static inline void
43 literal_matched(lzma_range_encoder *rc, probability *subcoder,
44                 uint32_t match_byte, uint32_t symbol)
45 {
46         uint32_t context = 1;
47         uint32_t bit_count = 8;
48
49         do {
50                 uint32_t bit = (symbol >> --bit_count) & 1;
51                 const uint32_t match_bit = (match_byte >> bit_count) & 1;
52                 rc_bit(rc, &subcoder[(0x100 << match_bit) + context], bit);
53                 context = (context << 1) | bit;
54
55                 if (match_bit != bit) {
56                         // The bit from the literal being encoded and the bit
57                         // from the previous match differ. Finish encoding
58                         // as a normal literal.
59                         while (bit_count != 0) {
60                                 bit = (symbol >> --bit_count) & 1;
61                                 rc_bit(rc, &subcoder[context], bit);
62                                 context = (context << 1) | bit;
63                         }
64
65                         break;
66                 }
67
68         } while (bit_count != 0);
69 }
70
71
72 static inline void
73 literal(lzma_coder *coder)
74 {
75         // Locate the literal byte to be encoded and the subcoder.
76         const uint8_t cur_byte = coder->lz.buffer[
77                         coder->lz.read_pos - coder->additional_offset];
78         probability *subcoder = literal_get_subcoder(coder->literal_coder,
79                         coder->now_pos, coder->previous_byte);
80
81         if (is_literal_state(coder->state)) {
82                 // Previous LZMA-symbol was a literal. Encode a normal
83                 // literal without a match byte.
84                 literal_normal(&coder->rc, subcoder, cur_byte);
85         } else {
86                 // Previous LZMA-symbol was a match. Use the last byte of
87                 // the match as a "match byte". That is, compare the bits
88                 // of the current literal and the match byte.
89                 const uint8_t match_byte = coder->lz.buffer[
90                                 coder->lz.read_pos - coder->reps[0] - 1
91                                 - coder->additional_offset];
92                 literal_matched(&coder->rc, subcoder, match_byte, cur_byte);
93         }
94
95         update_literal(coder->state);
96         coder->previous_byte = cur_byte;
97 }
98
99
100 //////////////////
101 // Match length //
102 //////////////////
103
104 static inline void
105 length(lzma_range_encoder *rc, lzma_length_encoder *lc,
106                 const uint32_t pos_state, uint32_t len)
107 {
108         assert(len <= MATCH_MAX_LEN);
109         len -= MATCH_MIN_LEN;
110
111         if (len < LEN_LOW_SYMBOLS) {
112                 rc_bit(rc, &lc->choice, 0);
113                 rc_bittree(rc, lc->low[pos_state], LEN_LOW_BITS, len);
114         } else {
115                 rc_bit(rc, &lc->choice, 1);
116                 len -= LEN_LOW_SYMBOLS;
117
118                 if (len < LEN_MID_SYMBOLS) {
119                         rc_bit(rc, &lc->choice2, 0);
120                         rc_bittree(rc, lc->mid[pos_state], LEN_MID_BITS, len);
121                 } else {
122                         rc_bit(rc, &lc->choice2, 1);
123                         len -= LEN_MID_SYMBOLS;
124                         rc_bittree(rc, lc->high, LEN_HIGH_BITS, len);
125                 }
126         }
127 }
128
129
130 ///////////
131 // Match //
132 ///////////
133
134 static inline void
135 match(lzma_coder *coder, const uint32_t pos_state,
136                 const uint32_t distance, const uint32_t len)
137 {
138         update_match(coder->state);
139
140         length(&coder->rc, &coder->match_len_encoder, pos_state, len);
141         coder->prev_len_encoder = &coder->match_len_encoder;
142
143         const uint32_t pos_slot = get_pos_slot(distance);
144         const uint32_t len_to_pos_state = get_len_to_pos_state(len);
145         rc_bittree(&coder->rc, coder->pos_slot_encoder[len_to_pos_state],
146                         POS_SLOT_BITS, pos_slot);
147
148         if (pos_slot >= START_POS_MODEL_INDEX) {
149                 const uint32_t footer_bits = (pos_slot >> 1) - 1;
150                 const uint32_t base = (2 | (pos_slot & 1)) << footer_bits;
151                 const uint32_t pos_reduced = distance - base;
152
153                 if (pos_slot < END_POS_MODEL_INDEX) {
154                         rc_bittree_reverse(&coder->rc,
155                                 &coder->pos_encoders[base - pos_slot - 1],
156                                 footer_bits, pos_reduced);
157                 } else {
158                         rc_direct(&coder->rc, pos_reduced >> ALIGN_BITS,
159                                         footer_bits - ALIGN_BITS);
160                         rc_bittree_reverse(
161                                         &coder->rc, coder->pos_align_encoder,
162                                         ALIGN_BITS, pos_reduced & ALIGN_MASK);
163                         ++coder->align_price_count;
164                 }
165         }
166
167         coder->reps[3] = coder->reps[2];
168         coder->reps[2] = coder->reps[1];
169         coder->reps[1] = coder->reps[0];
170         coder->reps[0] = distance;
171         ++coder->match_price_count;
172 }
173
174
175 ////////////////////
176 // Repeated match //
177 ////////////////////
178
179 static inline void
180 rep_match(lzma_coder *coder, const uint32_t pos_state,
181                 const uint32_t rep, const uint32_t len)
182 {
183         if (rep == 0) {
184                 rc_bit(&coder->rc, &coder->is_rep0[coder->state], 0);
185                 rc_bit(&coder->rc,
186                                 &coder->is_rep0_long[coder->state][pos_state],
187                                 len != 1);
188         } else {
189                 const uint32_t distance = coder->reps[rep];
190                 rc_bit(&coder->rc, &coder->is_rep0[coder->state], 1);
191
192                 if (rep == 1) {
193                         rc_bit(&coder->rc, &coder->is_rep1[coder->state], 0);
194                 } else {
195                         rc_bit(&coder->rc, &coder->is_rep1[coder->state], 1);
196                         rc_bit(&coder->rc, &coder->is_rep2[coder->state],
197                                         rep - 2);
198
199                         if (rep == 3)
200                                 coder->reps[3] = coder->reps[2];
201
202                         coder->reps[2] = coder->reps[1];
203                 }
204
205                 coder->reps[1] = coder->reps[0];
206                 coder->reps[0] = distance;
207         }
208
209         if (len == 1) {
210                 update_short_rep(coder->state);
211         } else {
212                 length(&coder->rc, &coder->rep_len_encoder, pos_state, len);
213                 coder->prev_len_encoder = &coder->rep_len_encoder;
214                 update_long_rep(coder->state);
215         }
216 }
217
218
219 //////////
220 // Main //
221 //////////
222
223 static void
224 encode_symbol(lzma_coder *coder, uint32_t pos, uint32_t len)
225 {
226         const uint32_t pos_state = coder->now_pos & coder->pos_mask;
227
228         if (len == 1 && pos == UINT32_MAX) {
229                 // Literal i.e. eight-bit byte
230                 rc_bit(&coder->rc,
231                                 &coder->is_match[coder->state][pos_state], 0);
232                 literal(coder);
233         } else {
234                 // Some type of match
235                 rc_bit(&coder->rc,
236                         &coder->is_match[coder->state][pos_state], 1);
237
238                 if (pos < REP_DISTANCES) {
239                         // It's a repeated match i.e. the same distance
240                         // has been used earlier.
241                         rc_bit(&coder->rc, &coder->is_rep[coder->state], 1);
242                         rep_match(coder, pos_state, pos, len);
243                 } else {
244                         // Normal match
245                         rc_bit(&coder->rc, &coder->is_rep[coder->state], 0);
246                         match(coder, pos_state, pos - REP_DISTANCES, len);
247                 }
248
249                 coder->previous_byte = coder->lz.buffer[
250                                 coder->lz.read_pos + len - 1
251                                 - coder->additional_offset];
252         }
253
254         assert(coder->additional_offset >= len);
255         coder->additional_offset -= len;
256         coder->now_pos += len;
257 }
258
259
260 static void
261 encode_eopm(lzma_coder *coder)
262 {
263         const uint32_t pos_state = coder->now_pos & coder->pos_mask;
264         rc_bit(&coder->rc, &coder->is_match[coder->state][pos_state], 1);
265         rc_bit(&coder->rc, &coder->is_rep[coder->state], 0);
266         match(coder, pos_state, UINT32_MAX, MATCH_MIN_LEN);
267 }
268
269
270 /**
271  * \brief       LZMA encoder
272  *
273  * \return      true if end of stream was reached, false otherwise.
274  */
275 extern bool
276 lzma_lzma_encode(lzma_coder *coder, uint8_t *restrict out,
277                 size_t *restrict out_pos, size_t out_size)
278 {
279         // Initialize the stream if no data has been encoded yet.
280         if (!coder->is_initialized) {
281                 if (coder->lz.read_pos == coder->lz.read_limit) {
282                         if (coder->lz.sequence == SEQ_RUN)
283                                 return false; // We cannot do anything.
284
285                         // We are finishing (we cannot get here when flushing).
286                         assert(coder->lz.write_pos == coder->lz.read_pos);
287                         assert(coder->lz.sequence == SEQ_FINISH);
288                 } else {
289                         // Do the actual initialization.
290                         uint32_t len;
291                         uint32_t num_distance_pairs;
292                         lzma_read_match_distances(coder,
293                                         &len, &num_distance_pairs);
294
295                         encode_symbol(coder, UINT32_MAX, 1);
296
297                         assert(coder->additional_offset == 0);
298                 }
299
300                 // Initialization is done (except if empty file).
301                 coder->is_initialized = true;
302         }
303
304         // Encoding loop
305         while (true) {
306                 // Encode pending bits, if any.
307                 if (rc_encode(&coder->rc, out, out_pos, out_size))
308                         return false;
309
310                 // Check that there is some input to process.
311                 if (coder->lz.read_pos >= coder->lz.read_limit) {
312                         // If flushing or finishing, we must keep encoding
313                         // until additional_offset becomes zero to make
314                         // all the input available at output.
315                         if (coder->lz.sequence == SEQ_RUN)
316                                 return false;
317
318                         if (coder->additional_offset == 0)
319                                 break;
320                 }
321
322                 assert(coder->lz.read_pos <= coder->lz.write_pos);
323
324 #ifndef NDEBUG
325                 if (coder->lz.sequence != SEQ_RUN) {
326                         assert(coder->lz.read_limit == coder->lz.write_pos);
327                 } else {
328                         assert(coder->lz.read_limit + coder->lz.keep_size_after
329                                         == coder->lz.write_pos);
330                 }
331 #endif
332
333                 uint32_t pos;
334                 uint32_t len;
335
336                 // Get optimal match (repeat position and length).
337                 // Value ranges for pos:
338                 //   - [0, REP_DISTANCES): repeated match
339                 //   - [REP_DISTANCES, UINT32_MAX):
340                 //     match at (pos - REP_DISTANCES)
341                 //   - UINT32_MAX: not a match but a literal
342                 // Value ranges for len:
343                 //   - [MATCH_MIN_LEN, MATCH_MAX_LEN]
344                 if (coder->best_compression)
345                         lzma_get_optimum(coder, &pos, &len);
346                 else
347                         lzma_get_optimum_fast(coder, &pos, &len);
348
349                 encode_symbol(coder, pos, len);
350         }
351
352         assert(!coder->longest_match_was_found);
353
354         if (coder->is_flushed) {
355                 coder->is_flushed = false;
356                 return true;
357         }
358
359         // We don't support encoding old LZMA streams without EOPM, and LZMA2
360         // doesn't use EOPM at LZMA level.
361         if (coder->write_eopm)
362                 encode_eopm(coder);
363
364         rc_flush(&coder->rc);
365
366         if (rc_encode(&coder->rc, out, out_pos, out_size)) {
367                 coder->is_flushed = true;
368                 return false;
369         }
370
371         return true;
372 }