├── README.md ├── Makefile ├── .gitignore ├── .github └── workflows │ └── semgrep.yml ├── LICENSE ├── bm_test.go └── bm.go /README.md: -------------------------------------------------------------------------------- 1 | bm 2 | == 3 | 4 | A Golang implementation of the Bentley/McIlroy compression algorithm 5 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | GCFLAGS := -B 2 | LDFLAGS := 3 | 4 | .PHONY: install 5 | install: 6 | @go install -v . 7 | 8 | .PHONY: test 9 | test: 10 | @go test -gcflags='$(GCFLAGS)' -ldflags='$(LDFLAGS)' . 11 | 12 | .PHONY: bench 13 | bench: 14 | @go test -gcflags='$(GCFLAGS)' -ldflags='$(LDFLAGS)' -bench . 15 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Compiled Object files, Static and Dynamic libs (Shared Objects) 2 | *.o 3 | *.a 4 | *.so 5 | *~ 6 | 7 | # Folders 8 | _obj 9 | _test 10 | 11 | # Architecture specific extensions/prefixes 12 | *.[568vq] 13 | [568vq].out 14 | 15 | *.cgo1.go 16 | *.cgo2.c 17 | _cgo_defun.c 18 | _cgo_gotypes.go 19 | _cgo_export.* 20 | 21 | _testmain.go 22 | 23 | *.exe 24 | -------------------------------------------------------------------------------- /.github/workflows/semgrep.yml: -------------------------------------------------------------------------------- 1 | 2 | on: 3 | pull_request: {} 4 | workflow_dispatch: {} 5 | push: 6 | branches: 7 | - main 8 | - master 9 | name: Semgrep config 10 | jobs: 11 | semgrep: 12 | name: semgrep/ci 13 | runs-on: ubuntu-20.04 14 | env: 15 | SEMGREP_APP_TOKEN: ${{ secrets.SEMGREP_APP_TOKEN }} 16 | SEMGREP_URL: https://cloudflare.semgrep.dev 17 | SEMGREP_APP_URL: https://cloudflare.semgrep.dev 18 | SEMGREP_VERSION_CHECK_URL: https://cloudflare.semgrep.dev/api/check-version 19 | container: 20 | image: 
returntocorp/semgrep 21 | steps: 22 | - uses: actions/checkout@v3 23 | - run: semgrep ci 24 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2013 CloudFlare, Inc. 2 | All rights reserved. 3 | 4 | Redistribution and use in source and binary forms, with or without modification, 5 | are permitted provided that the following conditions are met: 6 | 7 | * Redistributions of source code must retain the above copyright notice, this 8 | list of conditions and the following disclaimer. 9 | 10 | * Redistributions in binary form must reproduce the above copyright notice, this 11 | list of conditions and the following disclaimer in the documentation and/or 12 | other materials provided with the distribution. 13 | 14 | * Neither the name of the CloudFlare, Inc. nor the names of its 15 | contributors may be used to endorse or promote products derived from 16 | this software without specific prior written permission. 17 | 18 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 19 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 20 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 21 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR 22 | ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 23 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 24 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON 25 | ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 26 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 27 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
28 | -------------------------------------------------------------------------------- /bm_test.go: -------------------------------------------------------------------------------- 1 | // bm_test.go: test suite for bmcompress 2 | // 3 | // Copyright (c) 2013 CloudFlare, Inc. 4 | 5 | package bm 6 | 7 | import ( 8 | "bytes" 9 | "testing" 10 | ) 11 | 12 | func assert(t *testing.T, b bool) { 13 | if !b { 14 | t.Fail() 15 | } 16 | } 17 | 18 | func TestCreateCompressor(t *testing.T) { 19 | b := new(bytes.Buffer) 20 | co := NewCompressor() 21 | co.SetWriter(b) 22 | d := new(Dictionary) 23 | assert(t, co.Ratio() == -1) 24 | assert(t, b != nil) 25 | assert(t, co != nil) 26 | assert(t, d != nil) 27 | } 28 | 29 | func TestSelfCompressAndExpand(t *testing.T) { 30 | b := new(bytes.Buffer) 31 | co := NewCompressor() 32 | co.SetWriter(b) 33 | d := new(Dictionary) 34 | assert(t, b != nil) 35 | assert(t, co != nil) 36 | assert(t, d != nil) 37 | s := []byte("the quick brown fox jumps over the lazy dogthe quick brown fox jumps over the lazy dogthe quick brown fox jumps over the lazy dog") 38 | dict := s 39 | d.Dict = dict 40 | co.SetDictionary(d) 41 | co.Write(s) 42 | assert(t, co.Ratio() == 0) 43 | co.Close() 44 | assert(t, b.Len() > 0) 45 | assert(t, b.Len() == 4) 46 | assert(t, b.Bytes()[0] == 0) 47 | assert(t, b.Bytes()[1] == 0) 48 | assert(t, b.Bytes()[2] == 0x81) 49 | assert(t, b.Bytes()[3] == 1) 50 | assert(t, co.Ratio() > 0) 51 | assert(t, co.Ratio() == (10000*b.Len()/len(s))) 52 | ex := NewExpander(b, dict) 53 | assert(t, ex != nil) 54 | out := make([]byte, 0) 55 | o, err := ex.Expand(out) 56 | assert(t, err == nil) 57 | assert(t, len(o) == len(s)) 58 | assert(t, bytes.Compare(o, s) == 0) 59 | } 60 | 61 | func TestExtraAtEnd(t *testing.T) { 62 | b := new(bytes.Buffer) 63 | co := NewCompressor() 64 | co.SetWriter(b) 65 | d := new(Dictionary) 66 | assert(t, b != nil) 67 | assert(t, co != nil) 68 | assert(t, d != nil) 69 | s := []byte("the quick brown fox jumps over the lazy 
dogthe quick brown fox jumps over the lazy dogthe quick brown fox jumps over the lazy dog") 70 | dict := s 71 | d.Dict = dict 72 | co.SetDictionary(d) 73 | s = []byte("the quick brown fox jumps over the lazy dogthe quick brown fox jumps over the lazy dogthe quick brown fox jumps over the lazy dogDOG") 74 | co.Write(s) 75 | assert(t, co.Ratio() == 0) 76 | co.Close() 77 | assert(t, b.Len() > 0) 78 | assert(t, b.Len() == 8) 79 | assert(t, b.Bytes()[0] == 0) 80 | assert(t, b.Bytes()[1] == 0) 81 | assert(t, b.Bytes()[2] == 0x81) 82 | assert(t, b.Bytes()[3] == 1) 83 | assert(t, b.Bytes()[4] == 3) 84 | assert(t, co.Ratio() > 0) 85 | assert(t, co.Ratio() == (10000*b.Len()/len(s))) 86 | assert(t, bytes.Compare(b.Bytes()[5:8], []byte("DOG")) == 0) 87 | ex := NewExpander(b, dict) 88 | assert(t, ex != nil) 89 | out := make([]byte, 0) 90 | o, err := ex.Expand(out) 91 | assert(t, err == nil) 92 | assert(t, len(o) == len(s)) 93 | assert(t, bytes.Compare(o, s) == 0) 94 | } 95 | 96 | func TestExtraAtStart(t *testing.T) { 97 | b := new(bytes.Buffer) 98 | co := NewCompressor() 99 | co.SetWriter(b) 100 | d := new(Dictionary) 101 | assert(t, b != nil) 102 | assert(t, co != nil) 103 | assert(t, d != nil) 104 | s := []byte("the quick brown fox jumps over the lazy dogthe quick brown fox jumps over the lazy dogthe quick brown fox jumps over the lazy dog") 105 | dict := s 106 | d.Dict = dict 107 | co.SetDictionary(d) 108 | s = []byte("THEthe quick brown fox jumps over the lazy dogthe quick brown fox jumps over the lazy dogthe quick brown fox jumps over the lazy dog") 109 | co.Write(s) 110 | assert(t, co.Ratio() == 0) 111 | co.Close() 112 | assert(t, b.Len() > 0) 113 | assert(t, b.Len() == 8) 114 | assert(t, b.Bytes()[0] == 3) 115 | assert(t, bytes.Compare(b.Bytes()[1:4], []byte("THE")) == 0) 116 | assert(t, b.Bytes()[4] == 0) 117 | assert(t, b.Bytes()[5] == 0) 118 | assert(t, b.Bytes()[6] == 0x81) 119 | assert(t, b.Bytes()[7] == 1) 120 | assert(t, co.Ratio() > 0) 121 | assert(t, co.Ratio() 
== (10000*b.Len()/len(s))) 122 | ex := NewExpander(b, dict) 123 | assert(t, ex != nil) 124 | out := make([]byte, 0) 125 | o, err := ex.Expand(out) 126 | assert(t, err == nil) 127 | assert(t, len(o) == len(s)) 128 | assert(t, bytes.Compare(o, s) == 0) 129 | } 130 | 131 | func TestExtraAtStartAndEnd(t *testing.T) { 132 | b := new(bytes.Buffer) 133 | co := NewCompressor() 134 | co.SetWriter(b) 135 | d := new(Dictionary) 136 | assert(t, b != nil) 137 | assert(t, co != nil) 138 | assert(t, d != nil) 139 | s := []byte("the quick brown fox jumps over the lazy dogthe quick brown fox jumps over the lazy dogthe quick brown fox jumps over the lazy dog") 140 | dict := s 141 | d.Dict = dict 142 | co.SetDictionary(d) 143 | s = []byte("THEthe quick brown fox jumps over the lazy dogthe quick brown fox jumps over the lazy dogthe quick brown fox jumps over the lazy dogDOG") 144 | co.Write(s) 145 | assert(t, co.Ratio() == 0) 146 | co.Close() 147 | assert(t, b.Len() > 0) 148 | assert(t, b.Len() == 12) 149 | assert(t, b.Bytes()[0] == 3) 150 | assert(t, bytes.Compare(b.Bytes()[1:4], []byte("THE")) == 0) 151 | assert(t, b.Bytes()[4] == 0) 152 | assert(t, b.Bytes()[5] == 0) 153 | assert(t, b.Bytes()[6] == 0x81) 154 | assert(t, b.Bytes()[7] == 1) 155 | assert(t, b.Bytes()[8] == 3) 156 | assert(t, bytes.Compare(b.Bytes()[9:12], []byte("DOG")) == 0) 157 | assert(t, co.Ratio() > 0) 158 | assert(t, co.Ratio() == (10000*b.Len()/len(s))) 159 | ex := NewExpander(b, dict) 160 | assert(t, ex != nil) 161 | out := make([]byte, 0) 162 | o, err := ex.Expand(out) 163 | assert(t, err == nil) 164 | assert(t, len(o) == len(s)) 165 | assert(t, bytes.Compare(o, s) == 0) 166 | } 167 | 168 | func TestNotQuite(t *testing.T) { 169 | b := new(bytes.Buffer) 170 | co := NewCompressor() 171 | co.SetWriter(b) 172 | d := new(Dictionary) 173 | assert(t, b != nil) 174 | assert(t, co != nil) 175 | assert(t, d != nil) 176 | s := []byte("the quick brown fox jumps over the lazy dogthe quick brown fox jumps over the lazy 
dogthe quick brown fox jumps over the lazy dog") 177 | dict := s 178 | d.Dict = dict 179 | co.SetDictionary(d) 180 | s = []byte("the quick brown fox jumps over the lazy dogthe quick brown fox jumps over the lazy dog the quick brown fox jumps over the lazy dog") 181 | co.Write(s) 182 | assert(t, co.Ratio() == 0) 183 | co.Close() 184 | assert(t, b.Len() > 0) 185 | assert(t, b.Len() == 48) 186 | assert(t, b.Bytes()[0] == 0) 187 | assert(t, b.Bytes()[1] == 0) 188 | assert(t, b.Bytes()[2] == 0x56) 189 | assert(t, b.Bytes()[3] == 0x2c) 190 | assert(t, bytes.Compare(b.Bytes()[4:], []byte(" the quick brown fox jumps over the lazy dog")) == 0) 191 | assert(t, co.Ratio() > 0) 192 | assert(t, co.Ratio() == (10000*b.Len()/len(s))) 193 | ex := NewExpander(b, dict) 194 | assert(t, ex != nil) 195 | out := make([]byte, 0) 196 | o, err := ex.Expand(out) 197 | assert(t, err == nil) 198 | assert(t, len(o) == len(s)) 199 | assert(t, bytes.Compare(o, s) == 0) 200 | } 201 | 202 | func TestBitInTheMiddle(t *testing.T) { 203 | b := new(bytes.Buffer) 204 | co := NewCompressor() 205 | co.SetWriter(b) 206 | d := new(Dictionary) 207 | assert(t, b != nil) 208 | assert(t, co != nil) 209 | assert(t, d != nil) 210 | s := []byte("the quick brown fox jumps over the lazy dogthe quick brown fox jumps over the lazy dogthe quick brown fox jumps over the lazy dog") 211 | dict := s 212 | d.Dict = dict 213 | co.SetDictionary(d) 214 | s1 := "the quick brown fox jumps over the lazy dogthe quick brown fox jumps over the lazy dogthe quick brown fox jumps over the lazy dog" 215 | s1 = s1 + "HELLO JOHN" + s1 216 | s = []byte(s1) 217 | co.Write(s) 218 | assert(t, co.Ratio() == 0) 219 | co.Close() 220 | assert(t, b.Len() > 0) 221 | assert(t, b.Len() == 19) 222 | assert(t, b.Bytes()[0] == 0) 223 | assert(t, b.Bytes()[1] == 0) 224 | assert(t, b.Bytes()[2] == 0x81) 225 | assert(t, b.Bytes()[3] == 1) 226 | assert(t, b.Bytes()[4] == 10) 227 | assert(t, bytes.Compare(b.Bytes()[5:15], []byte("HELLO JOHN")) == 0) 228 | 
assert(t, b.Bytes()[15] == 0) 229 | assert(t, b.Bytes()[16] == 0) 230 | assert(t, b.Bytes()[17] == 0x81) 231 | assert(t, b.Bytes()[18] == 1) 232 | assert(t, co.Ratio() > 0) 233 | assert(t, co.Ratio() == (10000*b.Len()/len(s))) 234 | assert(t, b.Len() > 0) 235 | ex := NewExpander(b, dict) 236 | assert(t, ex != nil) 237 | out := make([]byte, 0) 238 | o, err := ex.Expand(out) 239 | assert(t, err == nil) 240 | assert(t, len(o) == len(s)) 241 | assert(t, bytes.Compare(o, s) == 0) 242 | } 243 | 244 | func TestTotallyDifferent(t *testing.T) { 245 | b := new(bytes.Buffer) 246 | co := NewCompressor() 247 | co.SetWriter(b) 248 | d := new(Dictionary) 249 | assert(t, b != nil) 250 | assert(t, co != nil) 251 | assert(t, d != nil) 252 | s := []byte("the quick brown fox jumps over the lazy dogthe quick brown fox jumps over the lazy dogthe quick brown fox jumps over the lazy dog") 253 | dict := s 254 | d.Dict = dict 255 | co.SetDictionary(d) 256 | s = []byte("THE QUICK BROWN FOX JUMPS OVER THE LAZY DOGTHE QUICK BROWN FOX JUMPS OVER THE LAZY DOG THE QUICK BROWN FOX JUMPS OVER THE LAZY DOG") 257 | co.Write(s) 258 | assert(t, co.Ratio() == 0) 259 | co.Close() 260 | assert(t, b.Len() > 0) 261 | assert(t, b.Len() == 132) 262 | assert(t, b.Bytes()[0] == 0x82) 263 | assert(t, b.Bytes()[1] == 0x01) 264 | assert(t, bytes.Compare(b.Bytes()[2:], []byte(s)) == 0) 265 | assert(t, co.Ratio() > 0) 266 | assert(t, co.Ratio() == (10000*b.Len()/len(s))) 267 | assert(t, b.Len() > 0) 268 | ex := NewExpander(b, dict) 269 | assert(t, ex != nil) 270 | out := make([]byte, 0) 271 | o, err := ex.Expand(out) 272 | assert(t, err == nil) 273 | assert(t, len(o) == len(s)) 274 | assert(t, bytes.Compare(o, s) == 0) 275 | } 276 | 277 | func TestSerializeDeserialize(t *testing.T) { 278 | b := new(bytes.Buffer) 279 | co := NewCompressor() 280 | co.SetWriter(b) 281 | d := new(Dictionary) 282 | assert(t, b != nil) 283 | assert(t, co != nil) 284 | assert(t, d != nil) 285 | s := []byte("the quick brown fox jumps over 
the lazy dogthe quick brown fox jumps over the lazy dogthe quick brown fox jumps over the lazy dog") 286 | dict := s 287 | d.Dict = dict 288 | co.SetDictionary(d) 289 | s = []byte("THE QUICK BROWN FOX JUMPS OVER THE LAZY DOGTHE QUICK BROWN FOX JUMPS OVER THE LAZY DOG THE QUICK BROWN FOX JUMPS OVER THE LAZY DOG") 290 | co.Write(s) 291 | assert(t, co.Ratio() == 0) 292 | co.Close() 293 | 294 | serialized, err := co.SerializeDictionary() 295 | assert(t, len(serialized) != 0) 296 | assert(t, err == nil) 297 | 298 | temp := make(map[uint32]uint32) 299 | for k, v := range co.GetDictionary().H { 300 | temp[k] = v 301 | } 302 | 303 | co.GetDictionary().H = make(map[uint32]uint32) 304 | assert(t, len(co.GetDictionary().H) == 0) 305 | 306 | m := make(map[uint32]uint32) 307 | err = DeserializeDictionary(serialized, m) 308 | assert(t, err == nil) 309 | 310 | assert(t, len(temp) != 0) 311 | assert(t, len(temp) == len(m)) 312 | 313 | for k, v := range m { 314 | assert(t, temp[k] == v) 315 | } 316 | 317 | d.Dict = dict 318 | d.H = m 319 | co.SetDictionary(d) 320 | 321 | assert(t, len(temp) == len(co.GetDictionary().H)) 322 | 323 | for k, v := range co.GetDictionary().H { 324 | assert(t, temp[k] == v) 325 | } 326 | } 327 | -------------------------------------------------------------------------------- /bm.go: -------------------------------------------------------------------------------- 1 | // bm.go: an implementation of the Bentley/McIlroy compression 2 | // technique. See "Data Compression Using Long Common Strings", Jon 3 | // Bentley, Douglas McIlroy, Proceedings of the IEEE Data Compression 4 | // Conference, 1999, pp. 287-295. 5 | // 6 | // Copyright (c) 2012-2013 CloudFlare, Inc. 7 | 8 | package bm 9 | 10 | import ( 11 | "bytes" 12 | "encoding/binary" 13 | "errors" 14 | "io" 15 | ) 16 | 17 | // The compressor uses the Rabin/Karp algorithm to create fingerprints 18 | // of sub-strings of the text to be compressed. 
19 | 20 | // Details of Rabin/Karp are in "Efficient randomized pattern-matching 21 | // algorithms", Karp, R. M., and Rabin, M. O., IBM Journal of Research 22 | // and Development 31, 2 March 1989, pp. 249-260. 23 | // 24 | // We work over bytes (rather than characters) and Rabin/Karp using 25 | // arithmetic in base d in the ring Zp where p is prime and d is the 26 | // nearest prime to the size of the input alphabet (since bytes d = 27 | // 257). For optimal storage we choose p such that pd fits as best as 28 | // it can into 32 bits. i.e. p is the prime nearest to 29 | // 2^32/d. Various tricks are performed below to speed the computation 30 | // of the hash. Notably p is not actually prime, it's a power of 2 so 31 | // that & is used instead of %. 32 | // 33 | // Fingerprints are generated over a fixed block size which is defined 34 | // here. This is very open to experimentation and could actually be 35 | // a parameter. 36 | // 37 | // To make this as fast as possible the actual Rabin/Karp algorithm is 38 | // not used, but values are picked to be powers of 2 so that slow 39 | // operations can be made very fast. 40 | 41 | const block uint32 = 50 42 | const radix uint32 = (1 << 8) + 1 43 | const prime uint32 = 1 << (32 - 8 - 1) 44 | const clip uint32 = prime - 1 // Used to emulate a % operation when we 45 | // know that prime is a power of two 46 | 47 | // A Dictionary contains both the raw data being compressed against 48 | // and the hash table built using the Rabin/Karp procedure 49 | type Dictionary struct { 50 | Dict []byte // Bytes to compress against 51 | H map[uint32]uint32 52 | // Stores the mapping between block checksums and their positions 53 | } 54 | 55 | // A Compressor is a complete instance of the compressor 56 | type Compressor struct { 57 | w io.Writer // The io.Writer where compressed data will be written 58 | f uint32 // The current fingerprint as we are processing 59 | d []byte // The data to be compressed. 
60 | l uint32 // Largest 'digit' in the radix that will be seen in the 61 | // fingerprint 62 | save [256]uint32 63 | dict Dictionary 64 | 65 | // Values that keep track of the size of the data that was written 66 | // and the compressed output size 67 | 68 | inSize int 69 | outSize int 70 | } 71 | 72 | // NewCompressor creates a new compressor. The Compressor implements 73 | // io.Writer and so calling Write() compress and writes to the actual 74 | // output. Note that you must call SetWriter and SetDictionary before 75 | // doing any compression to set the output writer. 76 | func NewCompressor() *Compressor { 77 | c := Compressor{} 78 | c.w = nil 79 | c.f = 0 80 | 81 | // Calculate the largest 'digit' that can be stored in the 82 | // fingerprint. It's radix^(block-1) mod prime. Calculated in a 83 | // loop to avoid an overflow when doing something like 256^100 mod 84 | // 16777213. 85 | 86 | c.l = 1 87 | var i uint32 88 | for i = 0; i < block-1; i++ { 89 | c.l *= radix 90 | c.l &= clip 91 | } 92 | 93 | for i = 0; i < 256; i++ { 94 | c.save[i] = i * c.l 95 | } 96 | 97 | c.inSize = 0 98 | c.outSize = 0 99 | 100 | return &c 101 | } 102 | 103 | // SetWriter sets the writer to which the compressed output will be written. 104 | // This must be called otherwise an error will occur. 105 | func (c *Compressor) SetWriter(w io.Writer) { 106 | c.w = w 107 | } 108 | 109 | // SetDictionary sets a dictionary. 
When a dictionary has been loaded 110 | // references are made to the dictionary (rather than internally in 111 | // the compressed data itself) 112 | func (c *Compressor) SetDictionary(dict *Dictionary) { 113 | c.dict.Dict = dict.Dict 114 | 115 | // If the dictionary of hashes has not been computed then it must 116 | // be computed now 117 | if dict.H == nil { 118 | c.dict.H = make(map[uint32]uint32) 119 | 120 | f := uint32(0) 121 | for ii := range c.dict.Dict { 122 | i := uint32(ii) 123 | 124 | if i < block { 125 | f = (f*radix + uint32(c.dict.Dict[i])) & clip 126 | } else { 127 | if i%block == 0 { 128 | _, exists := c.dict.H[f] 129 | if !exists { 130 | c.dict.H[f] = uint32(i - block) 131 | } 132 | } 133 | 134 | f = (radix*(f-c.save[c.dict.Dict[i-block]]) + 135 | uint32(c.dict.Dict[i])) & clip 136 | } 137 | } 138 | } else { 139 | c.dict.H = dict.H 140 | } 141 | } 142 | 143 | // GetDictionary retrieves the dictionary structure for serialization 144 | func (c *Compressor) GetDictionary() *Dictionary { 145 | return &c.dict 146 | } 147 | 148 | // Write implements the io.Writer interface. To compress data Write 149 | // repeatedly and it will be compressed. When done it is necessary to 150 | // call Close() where the actual compression occurs. 151 | func (c *Compressor) Write(p []byte) (int, error) { 152 | c.d = append(c.d, p...) 153 | n := len(p) 154 | c.inSize += n 155 | return n, nil 156 | } 157 | 158 | // File format: 159 | // 160 | // A section of uncompressed data is written with a length value 161 | // preceding it. The length is a base 128 number (a varint) with the 162 | // same encoding Google Protocol Buffers uses. 163 | // 164 | // A compression section starts with a 0 (since no uncompressed 165 | // section can have zero length) followed by a pair of varints giving 166 | // the offset and length of the region to be copied. 167 | 168 | // writeVarUint: writes out a variable integer which uses base 128 169 | // in the style of Google Protocol Buffers. 
170 | func (c *Compressor) writeVarUint(u uint32) error { 171 | buf := make([]byte, 1) 172 | 173 | for { 174 | buf[0] = byte(u & 0x7F) 175 | u >>= 7 176 | 177 | if u != 0 { 178 | buf[0] |= 0x80 179 | } 180 | n, err := c.w.Write(buf) 181 | if err != nil { 182 | return err 183 | } 184 | c.outSize += n 185 | if u == 0 { 186 | break 187 | } 188 | } 189 | return nil 190 | } 191 | 192 | // writeUncompressedBlock: writes out a block of uncompressed data 193 | // preceded by the length of the block as a variable length integer 194 | func (c *Compressor) writeUncompressedBlock(d []byte) error { 195 | if len(d) == 0 { 196 | return nil 197 | } 198 | if err := c.writeVarUint(uint32(len(d))); err != nil { 199 | return err 200 | } 201 | if n, err := c.w.Write(d); err != nil { 202 | return err 203 | } else { 204 | c.outSize += n 205 | } 206 | return nil 207 | } 208 | 209 | // writeCompressedReference: writes out a block of compressed data 210 | // which simply consists of a reference to the start of a block to 211 | // copy and its length. This is preceded by zero to indicate that 212 | // this is a block of compressed data 213 | func (c *Compressor) writeCompressedReference(start, offset uint32) error { 214 | zero := []byte{0} 215 | if n, err := c.w.Write(zero); err != nil { 216 | return err 217 | } else { 218 | c.outSize += n 219 | } 220 | if err := c.writeVarUint(start); err != nil { 221 | return err 222 | } 223 | 224 | return c.writeVarUint(offset) 225 | 226 | } 227 | 228 | // Close tells the compressor that all the data has been written. 229 | // This does not close the underlying io.Writer. This is where the 230 | // Bentley/McIlroy and Rabin/Karp algorithms are implemented. 231 | // Reference those papers for a full explanation. 232 | func (c *Compressor) Close() error { 233 | var skip uint32 234 | var last uint32 235 | 236 | // This points to the slice containing the buffer used as the 237 | // dictionary for the compression. 
This is either the data itself 238 | // (for self referential compression) or its the dictionary set by 239 | // SetDictionary 240 | 241 | for x := range c.d { 242 | i := uint32(x) 243 | 244 | // The first block bytes are consumed to calculate the 245 | // fingerprint of the first block 246 | 247 | if i < block { 248 | c.f = (c.f*radix + uint32(c.d[i])) & clip 249 | } else { 250 | 251 | // The data is broken up into non-overlapping blocks of 252 | // size block. The fingerprints of each block are stored 253 | // in a hash table which keeps the position the first time 254 | // the block was seen. 255 | // 256 | // The fingerprint of the current block which covers the 257 | // block bytes (i-block,i] is calculated efficiently 258 | // 259 | // The canonical calculation is as follows: 260 | // 261 | // c.f = ( radix * ( c.f - c.d[i-block] * c.l ) + c.d[i] ) % prime 262 | // 263 | // But a number of tricks are performed to make this 264 | // faster. First, values of c.d[i-block] * c.l are kept in 265 | // an array so they are only calculated once. Second, the 266 | // modulo calculation is done using bit twiddling rather 267 | // than division. 268 | 269 | if i >= skip { 270 | 271 | // If the new fingerprint appears in the hash table 272 | // then this block has been seen before and can be 273 | // encoded rather than emitted. 
First must check that 274 | // it's an actual match since there is a small 275 | // probability of the hashing algorithm used for 276 | // calculating fingerprints having a collision 277 | 278 | e, exists := c.dict.H[c.f] 279 | match := false 280 | if exists { 281 | match = true 282 | var j uint32 283 | for j = 0; j < block; j++ { 284 | if c.dict.Dict[e+j] != c.d[i-block+j] { 285 | match = false 286 | break 287 | } 288 | } 289 | } 290 | 291 | // If there's a match then we need to figure out how 292 | // far we can extend it backwards up to block-1 bytes 293 | // and forward as far as possible 294 | 295 | if match { 296 | var s uint32 297 | for s = 1; s < block; s++ { 298 | if i < last+block+s { 299 | break 300 | } 301 | 302 | if e < s { 303 | break 304 | } 305 | 306 | if i < block+s { 307 | break 308 | } 309 | 310 | if c.dict.Dict[e-s] != c.d[i-block-s] { 311 | break 312 | } 313 | } 314 | s-- 315 | 316 | var f uint32 317 | for f = 0; f < uint32(len(c.d))-i; f++ { 318 | if e+block+f >= uint32(len(c.dict.Dict)) { 319 | break 320 | } 321 | 322 | if c.dict.Dict[e+block+f] != c.d[i+f] { 323 | break 324 | } 325 | } 326 | 327 | if err := c.writeUncompressedBlock(c.d[last : i-block-s]); err != nil { 328 | return err 329 | } 330 | if err := c.writeCompressedReference(e-s, block+s+f); err != nil { 331 | return err 332 | } 333 | skip = i + f + block + 1 334 | last = i + f 335 | } 336 | 337 | } 338 | 339 | c.f = ((c.f-c.save[c.d[i-block]])*radix + uint32(c.d[i])) & clip 340 | } 341 | } 342 | 343 | if last < uint32(len(c.d)) { 344 | return c.writeUncompressedBlock(c.d[last:]) 345 | } 346 | 347 | return nil 348 | } 349 | 350 | // Ratio retrieves the compression ratio of the last compression 351 | // performed. Only makes sense after Close() has been called. The 352 | // returned value is an integer representing the size of the output as 353 | // a percentage of the input * 100. If the return value is -1 then it 354 | // indicates that there was no input. 
355 | func (c *Compressor) Ratio() int { 356 | if c.inSize > 0 { 357 | return (10000 * c.outSize) / c.inSize 358 | } 359 | return -1 360 | } 361 | 362 | // CompressedSize returns the size in bytes of the last compressed 363 | // output. Only makes sense after Close() has been called. 364 | func (c *Compressor) CompressedSize() int { 365 | return c.outSize 366 | } 367 | 368 | // InputSize returns the size in bytes of the last uncompressed 369 | // input. Only makes sense after Close() has been called. 370 | func (c *Compressor) InputSize() int { 371 | return c.inSize 372 | } 373 | 374 | // SerializeDictionary turns H (the map part of the Dictionary) into a 375 | // []byte for easy storage in memcached or elsewhere. 376 | func (c *Compressor) SerializeDictionary() ([]byte, error) { 377 | if len(c.dict.H) > 0 { 378 | 379 | // This reserves enough space in buf to store the entire map 380 | // with the worst case encoding. The worst case encoding is 381 | // likely to be used for the map keys (because they are 382 | // hashes) but not for the offset values which will in general 383 | // be small. 
384 | 385 | buf := bytes.NewBuffer(make([]byte, 0, 386 | len(c.dict.H)*2*binary.MaxVarintLen32)) 387 | 388 | for k, v := range c.dict.H { 389 | if err := binary.Write(buf, binary.LittleEndian, k); err != nil { 390 | return nil, err 391 | } 392 | if err := binary.Write(buf, binary.LittleEndian, v); err != nil { 393 | return nil, err 394 | } 395 | } 396 | 397 | return buf.Bytes(), nil 398 | } 399 | 400 | return []byte{}, nil 401 | } 402 | 403 | // DeserializeDictionary reads the H part of the Dictionary from a 404 | // []byte previously created with SerializeDictionary 405 | func DeserializeDictionary(o []byte, m map[uint32]uint32) error { 406 | buf := bytes.NewBuffer(o) 407 | 408 | for buf.Len() > 0 { 409 | var k uint32 410 | 411 | if err := binary.Read(buf, binary.LittleEndian, &k); err != nil { 412 | return err 413 | } 414 | var v uint32 415 | if err := binary.Read(buf, binary.LittleEndian, &v); err != nil { 416 | return err 417 | } 418 | m[k] = v 419 | } 420 | 421 | return nil 422 | } 423 | 424 | // An Expander is the complete state of the expander returned by NewExpander 425 | type Expander struct { 426 | r io.Reader // The io.Reader from which the raw compressed data 427 | // is read 428 | d []byte // Data read from r is stored here so that 429 | // compressed back references can be handled 430 | to int // Position in d to which the caller has read 431 | dict []byte // Dictionary to decompress against, if set then 432 | // decompression is done referencing this. If not 433 | // then references are internal. 434 | } 435 | 436 | // NewExpander creates a new decompressor. Pass in an io.Reader that 437 | // can be used to read the raw compressed data. Call the Expand 438 | // method to decompress the data; note that Expander does not 439 | // implement io.Reader. 
440 | func NewExpander(r io.Reader, dict []byte) *Expander { 441 | e := Expander{} 442 | e.r = r 443 | e.to = 0 444 | e.dict = dict 445 | return &e 446 | } 447 | 448 | // readVarUint: since the compressed data consists of varints (see 449 | // bmcompress.go) for details then the fundamental operation is 450 | // reading varints 451 | func (e *Expander) readVarUint() (uint, error) { 452 | u := uint(0) 453 | b := make([]byte, 1) 454 | m := uint(1) 455 | for { 456 | if n, err := e.r.Read(b); n != 1 || err != nil { 457 | return 0, err 458 | } 459 | 460 | u += m * uint(b[0]&byte(0x7F)) 461 | m *= 128 462 | 463 | if b[0] < 128 { 464 | break 465 | } 466 | } 467 | 468 | return u, nil 469 | } 470 | 471 | // Expand expands the compressed data into a buffer 472 | func (e *Expander) Expand(p []byte) (q []byte, err error) { 473 | 474 | // This is done to capture the extreme case that an out of 475 | // bounds error occurs in the expansion. This should never 476 | // happen, but this protects against a corrupt compressed 477 | // block. If this occurs then we return no data at all to 478 | // prevent any bad data being returned to the client. 479 | 480 | defer func() { 481 | if x := recover(); x != nil { 482 | err = errors.New("panic caught inside expander") 483 | p = p[:0] 484 | } 485 | }() 486 | 487 | q = p 488 | A: 489 | for { 490 | var u uint 491 | if u, err = e.readVarUint(); err != nil { 492 | break A 493 | } 494 | 495 | // If the value read is zero then it indicates a compressed 496 | // section which is formed of two varints indicating the 497 | // offset and length, if not then it's an uncompressed section 498 | 499 | if u == 0 { 500 | var offset uint 501 | if offset, err = e.readVarUint(); err != nil { 502 | break A 503 | } 504 | 505 | var length uint 506 | if length, err = e.readVarUint(); err != nil { 507 | break A 508 | } 509 | 510 | q = append(q, e.dict[offset:offset+length]...) 
511 | } else { 512 | left := u 513 | for left > 0 { 514 | buf := make([]byte, left) 515 | var n int 516 | if n, err = e.r.Read(buf); err != nil { 517 | break A 518 | } 519 | if n > 0 { 520 | left -= uint(n) 521 | q = append(q, buf[0:n]...) 522 | } else { 523 | break A 524 | } 525 | } 526 | } 527 | } 528 | 529 | if err == io.EOF { 530 | err = nil 531 | } 532 | 533 | return 534 | } 535 | --------------------------------------------------------------------------------