mozjpeg 源码在此 https://github.com/mozilla/mozjpeg
主要特性有:

  • 兼容libjpeg,可以无缝将libjpeg替换成mozjpeg。
  • 渐进式编码。
  • 网格量化法,这个特性也是我最关心的特性,这个特性实现了最大化压缩质量。
  • 自定义量化表。
  • 完全兼容web浏览器。

mozjpeg

一级

jpeg_finish_compress() 主要的循环在这里面

jcapimin.c

line: 173

主要的循环在这里

while(is_last_pass) {
prepare_for_pass() 控制本次循环的走向
}

核心函数prepare_for_pass()

各个阶段的初始化都是在此完成的。

jcmaster.c line:460

例如:
case: main_pass

case:output_pass

case: trellis_pass 网格量化

case: huff_opt_pass

compress_data() compress_output()

####### jccoefct.c line:156 和 line:498

encode_mcu_AC_first

######## jcarith.c line:460
编码核心
该步骤也是耗时最长,优化效果最好的部分

依据标准实现
https://www.w3.org/Graphics/JPEG/itu-t81.pdf
主要是yong’lai算DCT的

arith_encode()

######### jcarith.c line:225

compress_trellis_pass()

jccoefct.c line: 353

改函数结束时,已经完成了DCT, 量化,网格量化的过程,因为最后调用了compress_output(), 把数据传递给了熵编码器处理了,即哈夫曼编码或者算数编码。

quantize_trellis()

jcdctmgr.c line: 930

compress_output()
encode_mcu_AC_refine
encode_mcu_DC_first
access_virt_barray()

######### jccoefct.c line:515

access_virt_barray()

######## jmemmgr.c line:906

To read the contents of a JPEG file as DCT coefficients, open the file and do
jpeg_read_header() as usual. But instead of calling jpeg_start_decompress()
and jpeg_read_scanlines(), call jpeg_read_coefficients(). This will read the
entire image into a set of virtual coefficient-block arrays, one array per
component. The return value is a pointer to an array of virtual-array
descriptors. Each virtual array can be accessed directly using the JPEG
memory manager’s access_virt_barray method (see Memory management, below,
and also read structure.txt’s discussion of virtual array handling). Or,
for simple transcoding to a different JPEG file format, the array list can
just be handed directly to jpeg_write_coefficients().

jpeg_start_compress()

jcapistd.c line:40

compress_first_pass()

jccoefct.c line:260

4次循环forward_DCT()

forward_DCT()

######## jcdctmgr.c line:685

quantize()

jcdctmgr.c line: 51

在forward_DCT()里
先做forward DCT,再算出来量化系数

####### coef_blocks

(*do_quantize) (coef_blocks[bi], divisors, workspace)

大概就是: coef_blocks = workspace/divisors

1
/* Quantize/descale the coefficients, and store into coef_blocks[] *

jcdctmgr.c line:750

量化系数存储在coef_blocks line:676
/*

  • Perform forward DCT on one or more blocks of a component.
    *
  • The input samples are taken from the sample_data[] array starting at
  • position start_row/start_col, and moving to the right for any additional
  • blocks. The quantized coefficients are returned in coef_blocks[].
    */

######## divisors

############ start_pass_fdctmgr()
jcdctmgr.c line : 245

赋值给divisors

######## workspace

############ (*do_preprocess) (workspace, qtbl)

是通过量化表计算出来的。

compress_output()

####### jccoefct.c line:499

我们来重点看下网格量化是怎么回事。
quantize_trellis()代码位于jcdctmgr.c文件内

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
GLOBAL(void)
quantize_trellis(j_compress_ptr cinfo,
c_derived_tbl *dctbl, DC哈夫曼编码相关table
c_derived_tbl *actbl, AC哈夫曼编码相关table
JBLOCKROW coef_blocks, //thisblockrow forward_DCT和量化后产生的系数矩阵
JBLOCKROW src,
JDIMENSION num_blocks,
JQUANT_TBL * qtbl,
double *norm_src, // 已打印
double *norm_coef, // 已打印
JCOEF *last_dc_val,
JBLOCKROW coef_blocks_above,
JBLOCKROW src_above
)
{
int i, j, k, l;
float accumulated_zero_dist[DCTSIZE2];
float accumulated_cost[DCTSIZE2];
int run_start[DCTSIZE2];
int bi;
float best_cost;
int last_coeff_idx; /* position of last nonzero coefficient */
float norm = 0.0;
float lambda_base;
float lambda;
float lambda_dc;
const float *lambda_tbl = (cinfo->master->use_lambda_weight_tbl) ?
jpeg_lambda_weights_csf_luma :
jpeg_lambda_weights_flat;
int Ss, Se;
float *accumulated_zero_block_cost = NULL;
float *accumulated_block_cost = NULL;
int *block_run_start = NULL;
int *requires_eob = NULL;
int has_eob;
float cost_all_zeros;
float best_cost_skip;
float cost;
int zero_run;
int run_bits;
int rate;
float *accumulated_dc_cost[DC_TRELLIS_MAX_CANDIDATES];
int *dc_cost_backtrack[DC_TRELLIS_MAX_CANDIDATES];
JCOEF *dc_candidate[DC_TRELLIS_MAX_CANDIDATES];
int mode = 1;
float lambda_table[DCTSIZE2];
const int dc_trellis_candidates = get_num_dc_trellis_candidates(qtbl->quantval[0]);

Ss = cinfo->Ss;
Se = cinfo->Se;
if (Ss == 0)
Ss = 1;
if (Se < Ss)
return;
if (cinfo->master->trellis_eob_opt) {
accumulated_zero_block_cost = (float *)malloc((num_blocks + 1) * sizeof(float));
accumulated_block_cost = (float *)malloc((num_blocks + 1) * sizeof(float));
block_run_start = (int *)malloc(num_blocks * sizeof(int));
requires_eob = (int *)malloc((num_blocks + 1) * sizeof(int));
if (!accumulated_zero_block_cost ||
!accumulated_block_cost ||
!block_run_start ||
!requires_eob) {
ERREXIT(cinfo, JERR_OUT_OF_MEMORY);
}

accumulated_zero_block_cost[0] = 0;
accumulated_block_cost[0] = 0;
requires_eob[0] = 0;
}

if (cinfo->master->trellis_quant_dc) {
for (i = 0; i < dc_trellis_candidates; i++) {
accumulated_dc_cost[i] = (float *)malloc(num_blocks * sizeof(float));
dc_cost_backtrack[i] = (int *)malloc(num_blocks * sizeof(int));
dc_candidate[i] = (JCOEF *)malloc(num_blocks * sizeof(JCOEF));
if (!accumulated_dc_cost[i] ||
!dc_cost_backtrack[i] ||
!dc_candidate[i]) {
ERREXIT(cinfo, JERR_OUT_OF_MEMORY);
}
}
}

norm = 0.0;
for (i = 1; i < DCTSIZE2; i++) {
norm += qtbl->quantval[i] * qtbl->quantval[i];
}
norm /= 63.0;

if (mode == 1) {
lambda_base = 1.0;
lambda_tbl = lambda_table;
for (i = 0; i < DCTSIZE2; i++)
lambda_table[i] = 1.0 / (qtbl->quantval[i] * qtbl->quantval[i]);
} else
lambda_base = 1.0 / norm;

for (bi = 0; bi < num_blocks; bi++) {

norm = 0.0;
for (i = 1; i < DCTSIZE2; i++) {
norm += src[bi][i] * src[bi][i];
}
norm /= 63.0;

if (cinfo->master->lambda_log_scale2 > 0.0)
lambda = pow(2.0, cinfo->master->lambda_log_scale1) * lambda_base /
(pow(2.0, cinfo->master->lambda_log_scale2) + norm);
else
lambda = pow(2.0, cinfo->master->lambda_log_scale1 - 12.0) * lambda_base;

lambda_dc = lambda * lambda_tbl[0];

accumulated_zero_dist[Ss-1] = 0.0;
accumulated_cost[Ss-1] = 0.0;

/* Do DC coefficient */
if (cinfo->master->trellis_quant_dc) {
int sign = src[bi][0] >> 31;
int x = abs(src[bi][0]);
int q = 8 * qtbl->quantval[0];
int qval;
float dc_candidate_dist;

qval = (x + q/2) / q; /* quantized value (round nearest) */
for (k = 0; k < dc_trellis_candidates; k++) {
int delta;
int dc_delta;
int bits;

dc_candidate[k][bi] = qval - dc_trellis_candidates/2 + k;
if (dc_candidate[k][bi] >= (1<<MAX_COEF_BITS))
dc_candidate[k][bi] = (1<<MAX_COEF_BITS)-1;
if (dc_candidate[k][bi] <= -(1<<MAX_COEF_BITS))
dc_candidate[k][bi] = -(1<<MAX_COEF_BITS)+1;

delta = dc_candidate[k][bi] * q - x;
dc_candidate_dist = delta * delta * lambda_dc;
dc_candidate[k][bi] *= 1 + 2*sign;

/* Take into account DC differences */
if (coef_blocks_above && src_above && cinfo->master->trellis_delta_dc_weight > 0.0) {
int dc_above_orig;
int dc_above_recon;
int dc_orig;
int dc_recon;
float vertical_dist;

dc_above_orig = src_above[bi][0];
dc_above_recon = coef_blocks_above[bi][0] * q;
dc_orig = src[bi][0];
dc_recon = dc_candidate[k][bi] * q;
/* delta is difference of vertical gradients */
delta = (dc_above_orig - dc_orig) - (dc_above_recon - dc_recon);
vertical_dist = delta * delta * lambda_dc;
dc_candidate_dist += cinfo->master->trellis_delta_dc_weight * (vertical_dist - dc_candidate_dist);
}

if (bi == 0) {
dc_delta = dc_candidate[k][bi] - *last_dc_val;

/* Derive number of suffix bits */
bits = 0;
dc_delta = abs(dc_delta);
while (dc_delta) {
dc_delta >>= 1;
bits++;
}
cost = bits + dctbl->ehufsi[bits] + dc_candidate_dist;
accumulated_dc_cost[k][0] = cost;
dc_cost_backtrack[k][0] = -1;
} else {
for (l = 0; l < dc_trellis_candidates; l++) {
dc_delta = dc_candidate[k][bi] - dc_candidate[l][bi-1];

/* Derive number of suffix bits */
bits = 0;
dc_delta = abs(dc_delta);
while (dc_delta) {
dc_delta >>= 1;
bits++;
}
cost = bits + dctbl->ehufsi[bits] + dc_candidate_dist + accumulated_dc_cost[l][bi-1];
if (l == 0 || cost < accumulated_dc_cost[k][bi]) {
accumulated_dc_cost[k][bi] = cost;
dc_cost_backtrack[k][bi] = l;
}
}
}
}
}

/* Do AC coefficients */
for (i = Ss; i <= Se; i++) {
int z = jpeg_natural_order[i];

int sign = src[bi][z] >> 31;
int x = abs(src[bi][z]);
int q = 8 * qtbl->quantval[z];
int candidate[16];
int candidate_bits[16];
float candidate_dist[16];
int num_candidates;
int qval;

accumulated_zero_dist[i] = x * x * lambda * lambda_tbl[z] + accumulated_zero_dist[i-1];

qval = (x + q/2) / q; /* quantized value (round nearest) */

if (qval == 0) {
coef_blocks[bi][z] = 0;
accumulated_cost[i] = 1e38; /* Shouldn't be needed */
continue;
}

if (qval >= (1<<MAX_COEF_BITS))
qval = (1<<MAX_COEF_BITS)-1;

num_candidates = jpeg_nbits_table[qval];
for (k = 0; k < num_candidates; k++) {
int delta;
candidate[k] = (k < num_candidates - 1) ? (2 << k) - 1 : qval;
delta = candidate[k] * q - x;
candidate_bits[k] = k+1;
candidate_dist[k] = delta * delta * lambda * lambda_tbl[z];
}

accumulated_cost[i] = 1e38;

for (j = Ss-1; j < i; j++) {
int zz = jpeg_natural_order[j];
if (j != Ss-1 && coef_blocks[bi][zz] == 0)
continue;

zero_run = i - 1 - j;
if ((zero_run >> 4) && actbl->ehufsi[0xf0] == 0)
continue;

run_bits = (zero_run >> 4) * actbl->ehufsi[0xf0];
zero_run &= 15;

for (k = 0; k < num_candidates; k++) {
int coef_bits = actbl->ehufsi[16 * zero_run + candidate_bits[k]];
if (coef_bits == 0)
continue;

rate = coef_bits + candidate_bits[k] + run_bits;
cost = rate + candidate_dist[k];
cost += accumulated_zero_dist[i-1] - accumulated_zero_dist[j] + accumulated_cost[j];

if (cost < accumulated_cost[i]) {
coef_blocks[bi][z] = (candidate[k] ^ sign) - sign;
accumulated_cost[i] = cost;
run_start[i] = j;
}
}
}
}

last_coeff_idx = Ss-1;
best_cost = accumulated_zero_dist[Se] + actbl->ehufsi[0];
cost_all_zeros = accumulated_zero_dist[Se];
best_cost_skip = cost_all_zeros;

for (i = Ss; i <= Se; i++) {
int z = jpeg_natural_order[i];
if (coef_blocks[bi][z] != 0) {
float cost = accumulated_cost[i] + accumulated_zero_dist[Se] - accumulated_zero_dist[i];
float cost_wo_eob = cost;

if (i < Se)
cost += actbl->ehufsi[0];

if (cost < best_cost) {
best_cost = cost;
last_coeff_idx = i;
best_cost_skip = cost_wo_eob;
}
}
}

has_eob = (last_coeff_idx < Se) + (last_coeff_idx == Ss-1);

/* Zero out coefficients that are part of runs */
i = Se;
while (i >= Ss)
{
while (i > last_coeff_idx) {
int z = jpeg_natural_order[i];
coef_blocks[bi][z] = 0;
i--;
}
last_coeff_idx = run_start[i];
i--;
}

if (cinfo->master->trellis_eob_opt) {
accumulated_zero_block_cost[bi+1] = accumulated_zero_block_cost[bi];
accumulated_zero_block_cost[bi+1] += cost_all_zeros;
requires_eob[bi+1] = has_eob;

best_cost = 1e38;

if (has_eob != 2) {
for (i = 0; i <= bi; i++) {
int zero_block_run;
int nbits;
float cost;

if (requires_eob[i] == 2)
continue;

cost = best_cost_skip; /* cost of coding a nonzero block */
cost += accumulated_zero_block_cost[bi];
cost -= accumulated_zero_block_cost[i];
cost += accumulated_block_cost[i];
zero_block_run = bi - i + requires_eob[i];
nbits = jpeg_nbits_table[zero_block_run];
cost += actbl->ehufsi[16*nbits] + nbits;

if (cost < best_cost) {
block_run_start[bi] = i;
best_cost = cost;
accumulated_block_cost[bi+1] = cost;
}
}
}
}
}

if (cinfo->master->trellis_eob_opt) {
int last_block = num_blocks;
best_cost = 1e38;

for (i = 0; i <= num_blocks; i++) {
int zero_block_run;
int nbits;
float cost = 0.0;

if (requires_eob[i] == 2)
continue;

cost += accumulated_zero_block_cost[num_blocks];
cost -= accumulated_zero_block_cost[i];
zero_block_run = num_blocks - i + requires_eob[i];
nbits = jpeg_nbits_table[zero_block_run];
cost += actbl->ehufsi[16*nbits] + nbits;
if (cost < best_cost) {
best_cost = cost;
last_block = i;
}
}
last_block--;
bi = num_blocks - 1;
while (bi >= 0) {
while (bi > last_block) {
for (j = Ss; j <= Se; j++) {
int z = jpeg_natural_order[j];
coef_blocks[bi][z] = 0;
}
bi--;
}
last_block = block_run_start[bi]-1;
bi--;
}
free(accumulated_zero_block_cost);
free(accumulated_block_cost);
free(block_run_start);
free(requires_eob);
}

if (cinfo->master->trellis_q_opt) {
for (bi = 0; bi < num_blocks; bi++) {
for (i = 1; i < DCTSIZE2; i++) {
norm_src[i] += src[bi][i] * coef_blocks[bi][i];
norm_coef[i] += 8 * coef_blocks[bi][i] * coef_blocks[bi][i];
}
}
}

if (cinfo->master->trellis_quant_dc) {
j = 0;
for (i = 1; i < dc_trellis_candidates; i++) {
if (accumulated_dc_cost[i][num_blocks-1] < accumulated_dc_cost[j][num_blocks-1])
j = i;
}
for (bi = num_blocks-1; bi >= 0; bi--) {
coef_blocks[bi][0] = dc_candidate[j][bi];
j = dc_cost_backtrack[j][bi];
}

/* Save DC predictor */
*last_dc_val = coef_blocks[num_blocks-1][0];

for (i = 0; i < dc_trellis_candidates; i++) {
free(accumulated_dc_cost[i]);
free(dc_cost_backtrack[i]);
free(dc_candidate[i]);
}
}

}
1
2