├── README.md ├── Makefile ├── .gitignore ├── .github └── workflows │ └── semgrep.yml ├── LICENSE ├── bm_test.go └── bm.go /README.md: -------------------------------------------------------------------------------- 1 | bm 2 | == 3 | 4 | A Golang implementation of the Bentley/McIlroy compression algorithm 5 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | GCFLAGS := -B 2 | LDFLAGS := 3 | 4 | .PHONY: install 5 | install: 6 | @go install -v . 7 | 8 | .PHONY: test 9 | test: 10 | @go test -gcflags='$(GCFLAGS)' -ldflags='$(LDFLAGS)' . 11 | 12 | .PHONY: bench 13 | bench: 14 | @go test -gcflags='$(GCFLAGS)' -ldflags='$(LDFLAGS)' -bench . 15 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Compiled Object files, Static and Dynamic libs (Shared Objects) 2 | *.o 3 | *.a 4 | *.so 5 | *~ 6 | 7 | # Folders 8 | _obj 9 | _test 10 | 11 | # Architecture specific extensions/prefixes 12 | *.[568vq] 13 | [568vq].out 14 | 15 | *.cgo1.go 16 | *.cgo2.c 17 | _cgo_defun.c 18 | _cgo_gotypes.go 19 | _cgo_export.* 20 | 21 | _testmain.go 22 | 23 | *.exe 24 | -------------------------------------------------------------------------------- /.github/workflows/semgrep.yml: -------------------------------------------------------------------------------- 1 | 2 | on: 3 | pull_request: {} 4 | workflow_dispatch: {} 5 | push: 6 | branches: 7 | - main 8 | - master 9 | name: Semgrep config 10 | jobs: 11 | semgrep: 12 | name: semgrep/ci 13 | runs-on: ubuntu-20.04 14 | env: 15 | SEMGREP_APP_TOKEN: ${{ secrets.SEMGREP_APP_TOKEN }} 16 | SEMGREP_URL: https://cloudflare.semgrep.dev 17 | SEMGREP_APP_URL: https://cloudflare.semgrep.dev 18 | SEMGREP_VERSION_CHECK_URL: https://cloudflare.semgrep.dev/api/check-version 19 | container: 20 | image: 
returntocorp/semgrep 21 | steps: 22 | - uses: actions/checkout@v3 23 | - run: semgrep ci 24 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2013 CloudFlare, Inc. 2 | All rights reserved. 3 | 4 | Redistribution and use in source and binary forms, with or without modification, 5 | are permitted provided that the following conditions are met: 6 | 7 | * Redistributions of source code must retain the above copyright notice, this 8 | list of conditions and the following disclaimer. 9 | 10 | * Redistributions in binary form must reproduce the above copyright notice, this 11 | list of conditions and the following disclaimer in the documentation and/or 12 | other materials provided with the distribution. 13 | 14 | * Neither the name of the CloudFlare, Inc. nor the names of its 15 | contributors may be used to endorse or promote products derived from 16 | this software without specific prior written permission. 17 | 18 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 19 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 20 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 21 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR 22 | ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 23 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 24 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON 25 | ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 26 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 27 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
28 | -------------------------------------------------------------------------------- /bm_test.go: -------------------------------------------------------------------------------- 1 | // bm_test.go: test suite for bmcompress 2 | // 3 | // Copyright (c) 2013 CloudFlare, Inc. 4 | 5 | package bm 6 | 7 | import ( 8 | "bytes" 9 | "testing" 10 | ) 11 | 12 | func assert(t *testing.T, b bool) { 13 | if !b { 14 | t.Fail() 15 | } 16 | } 17 | 18 | func TestCreateCompressor(t *testing.T) { 19 | b := new(bytes.Buffer) 20 | co := NewCompressor() 21 | co.SetWriter(b) 22 | d := new(Dictionary) 23 | assert(t, co.Ratio() == -1) 24 | assert(t, b != nil) 25 | assert(t, co != nil) 26 | assert(t, d != nil) 27 | } 28 | 29 | func TestSelfCompressAndExpand(t *testing.T) { 30 | b := new(bytes.Buffer) 31 | co := NewCompressor() 32 | co.SetWriter(b) 33 | d := new(Dictionary) 34 | assert(t, b != nil) 35 | assert(t, co != nil) 36 | assert(t, d != nil) 37 | s := []byte("the quick brown fox jumps over the lazy dogthe quick brown fox jumps over the lazy dogthe quick brown fox jumps over the lazy dog") 38 | dict := s 39 | d.Dict = dict 40 | co.SetDictionary(d) 41 | co.Write(s) 42 | assert(t, co.Ratio() == 0) 43 | co.Close() 44 | assert(t, b.Len() > 0) 45 | assert(t, b.Len() == 4) 46 | assert(t, b.Bytes()[0] == 0) 47 | assert(t, b.Bytes()[1] == 0) 48 | assert(t, b.Bytes()[2] == 0x81) 49 | assert(t, b.Bytes()[3] == 1) 50 | assert(t, co.Ratio() > 0) 51 | assert(t, co.Ratio() == (10000*b.Len()/len(s))) 52 | ex := NewExpander(b, dict) 53 | assert(t, ex != nil) 54 | out := make([]byte, 0) 55 | o, err := ex.Expand(out) 56 | assert(t, err == nil) 57 | assert(t, len(o) == len(s)) 58 | assert(t, bytes.Compare(o, s) == 0) 59 | } 60 | 61 | func TestExtraAtEnd(t *testing.T) { 62 | b := new(bytes.Buffer) 63 | co := NewCompressor() 64 | co.SetWriter(b) 65 | d := new(Dictionary) 66 | assert(t, b != nil) 67 | assert(t, co != nil) 68 | assert(t, d != nil) 69 | s := []byte("the quick brown fox jumps over the lazy 
dogthe quick brown fox jumps over the lazy dogthe quick brown fox jumps over the lazy dog") 70 | dict := s 71 | d.Dict = dict 72 | co.SetDictionary(d) 73 | s = []byte("the quick brown fox jumps over the lazy dogthe quick brown fox jumps over the lazy dogthe quick brown fox jumps over the lazy dogDOG") 74 | co.Write(s) 75 | assert(t, co.Ratio() == 0) 76 | co.Close() 77 | assert(t, b.Len() > 0) 78 | assert(t, b.Len() == 8) 79 | assert(t, b.Bytes()[0] == 0) 80 | assert(t, b.Bytes()[1] == 0) 81 | assert(t, b.Bytes()[2] == 0x81) 82 | assert(t, b.Bytes()[3] == 1) 83 | assert(t, b.Bytes()[4] == 3) 84 | assert(t, co.Ratio() > 0) 85 | assert(t, co.Ratio() == (10000*b.Len()/len(s))) 86 | assert(t, bytes.Compare(b.Bytes()[5:8], []byte("DOG")) == 0) 87 | ex := NewExpander(b, dict) 88 | assert(t, ex != nil) 89 | out := make([]byte, 0) 90 | o, err := ex.Expand(out) 91 | assert(t, err == nil) 92 | assert(t, len(o) == len(s)) 93 | assert(t, bytes.Compare(o, s) == 0) 94 | } 95 | 96 | func TestExtraAtStart(t *testing.T) { 97 | b := new(bytes.Buffer) 98 | co := NewCompressor() 99 | co.SetWriter(b) 100 | d := new(Dictionary) 101 | assert(t, b != nil) 102 | assert(t, co != nil) 103 | assert(t, d != nil) 104 | s := []byte("the quick brown fox jumps over the lazy dogthe quick brown fox jumps over the lazy dogthe quick brown fox jumps over the lazy dog") 105 | dict := s 106 | d.Dict = dict 107 | co.SetDictionary(d) 108 | s = []byte("THEthe quick brown fox jumps over the lazy dogthe quick brown fox jumps over the lazy dogthe quick brown fox jumps over the lazy dog") 109 | co.Write(s) 110 | assert(t, co.Ratio() == 0) 111 | co.Close() 112 | assert(t, b.Len() > 0) 113 | assert(t, b.Len() == 8) 114 | assert(t, b.Bytes()[0] == 3) 115 | assert(t, bytes.Compare(b.Bytes()[1:4], []byte("THE")) == 0) 116 | assert(t, b.Bytes()[4] == 0) 117 | assert(t, b.Bytes()[5] == 0) 118 | assert(t, b.Bytes()[6] == 0x81) 119 | assert(t, b.Bytes()[7] == 1) 120 | assert(t, co.Ratio() > 0) 121 | assert(t, co.Ratio() 
== (10000*b.Len()/len(s))) 122 | ex := NewExpander(b, dict) 123 | assert(t, ex != nil) 124 | out := make([]byte, 0) 125 | o, err := ex.Expand(out) 126 | assert(t, err == nil) 127 | assert(t, len(o) == len(s)) 128 | assert(t, bytes.Compare(o, s) == 0) 129 | } 130 | 131 | func TestExtraAtStartAndEnd(t *testing.T) { 132 | b := new(bytes.Buffer) 133 | co := NewCompressor() 134 | co.SetWriter(b) 135 | d := new(Dictionary) 136 | assert(t, b != nil) 137 | assert(t, co != nil) 138 | assert(t, d != nil) 139 | s := []byte("the quick brown fox jumps over the lazy dogthe quick brown fox jumps over the lazy dogthe quick brown fox jumps over the lazy dog") 140 | dict := s 141 | d.Dict = dict 142 | co.SetDictionary(d) 143 | s = []byte("THEthe quick brown fox jumps over the lazy dogthe quick brown fox jumps over the lazy dogthe quick brown fox jumps over the lazy dogDOG") 144 | co.Write(s) 145 | assert(t, co.Ratio() == 0) 146 | co.Close() 147 | assert(t, b.Len() > 0) 148 | assert(t, b.Len() == 12) 149 | assert(t, b.Bytes()[0] == 3) 150 | assert(t, bytes.Compare(b.Bytes()[1:4], []byte("THE")) == 0) 151 | assert(t, b.Bytes()[4] == 0) 152 | assert(t, b.Bytes()[5] == 0) 153 | assert(t, b.Bytes()[6] == 0x81) 154 | assert(t, b.Bytes()[7] == 1) 155 | assert(t, b.Bytes()[8] == 3) 156 | assert(t, bytes.Compare(b.Bytes()[9:12], []byte("DOG")) == 0) 157 | assert(t, co.Ratio() > 0) 158 | assert(t, co.Ratio() == (10000*b.Len()/len(s))) 159 | ex := NewExpander(b, dict) 160 | assert(t, ex != nil) 161 | out := make([]byte, 0) 162 | o, err := ex.Expand(out) 163 | assert(t, err == nil) 164 | assert(t, len(o) == len(s)) 165 | assert(t, bytes.Compare(o, s) == 0) 166 | } 167 | 168 | func TestNotQuite(t *testing.T) { 169 | b := new(bytes.Buffer) 170 | co := NewCompressor() 171 | co.SetWriter(b) 172 | d := new(Dictionary) 173 | assert(t, b != nil) 174 | assert(t, co != nil) 175 | assert(t, d != nil) 176 | s := []byte("the quick brown fox jumps over the lazy dogthe quick brown fox jumps over the lazy 
dogthe quick brown fox jumps over the lazy dog") 177 | dict := s 178 | d.Dict = dict 179 | co.SetDictionary(d) 180 | s = []byte("the quick brown fox jumps over the lazy dogthe quick brown fox jumps over the lazy dog the quick brown fox jumps over the lazy dog") 181 | co.Write(s) 182 | assert(t, co.Ratio() == 0) 183 | co.Close() 184 | assert(t, b.Len() > 0) 185 | assert(t, b.Len() == 48) 186 | assert(t, b.Bytes()[0] == 0) 187 | assert(t, b.Bytes()[1] == 0) 188 | assert(t, b.Bytes()[2] == 0x56) 189 | assert(t, b.Bytes()[3] == 0x2c) 190 | assert(t, bytes.Compare(b.Bytes()[4:], []byte(" the quick brown fox jumps over the lazy dog")) == 0) 191 | assert(t, co.Ratio() > 0) 192 | assert(t, co.Ratio() == (10000*b.Len()/len(s))) 193 | ex := NewExpander(b, dict) 194 | assert(t, ex != nil) 195 | out := make([]byte, 0) 196 | o, err := ex.Expand(out) 197 | assert(t, err == nil) 198 | assert(t, len(o) == len(s)) 199 | assert(t, bytes.Compare(o, s) == 0) 200 | } 201 | 202 | func TestBitInTheMiddle(t *testing.T) { 203 | b := new(bytes.Buffer) 204 | co := NewCompressor() 205 | co.SetWriter(b) 206 | d := new(Dictionary) 207 | assert(t, b != nil) 208 | assert(t, co != nil) 209 | assert(t, d != nil) 210 | s := []byte("the quick brown fox jumps over the lazy dogthe quick brown fox jumps over the lazy dogthe quick brown fox jumps over the lazy dog") 211 | dict := s 212 | d.Dict = dict 213 | co.SetDictionary(d) 214 | s1 := "the quick brown fox jumps over the lazy dogthe quick brown fox jumps over the lazy dogthe quick brown fox jumps over the lazy dog" 215 | s1 = s1 + "HELLO JOHN" + s1 216 | s = []byte(s1) 217 | co.Write(s) 218 | assert(t, co.Ratio() == 0) 219 | co.Close() 220 | assert(t, b.Len() > 0) 221 | assert(t, b.Len() == 19) 222 | assert(t, b.Bytes()[0] == 0) 223 | assert(t, b.Bytes()[1] == 0) 224 | assert(t, b.Bytes()[2] == 0x81) 225 | assert(t, b.Bytes()[3] == 1) 226 | assert(t, b.Bytes()[4] == 10) 227 | assert(t, bytes.Compare(b.Bytes()[5:15], []byte("HELLO JOHN")) == 0) 228 | 
assert(t, b.Bytes()[15] == 0) 229 | assert(t, b.Bytes()[16] == 0) 230 | assert(t, b.Bytes()[17] == 0x81) 231 | assert(t, b.Bytes()[18] == 1) 232 | assert(t, co.Ratio() > 0) 233 | assert(t, co.Ratio() == (10000*b.Len()/len(s))) 234 | assert(t, b.Len() > 0) 235 | ex := NewExpander(b, dict) 236 | assert(t, ex != nil) 237 | out := make([]byte, 0) 238 | o, err := ex.Expand(out) 239 | assert(t, err == nil) 240 | assert(t, len(o) == len(s)) 241 | assert(t, bytes.Compare(o, s) == 0) 242 | } 243 | 244 | func TestTotallyDifferent(t *testing.T) { 245 | b := new(bytes.Buffer) 246 | co := NewCompressor() 247 | co.SetWriter(b) 248 | d := new(Dictionary) 249 | assert(t, b != nil) 250 | assert(t, co != nil) 251 | assert(t, d != nil) 252 | s := []byte("the quick brown fox jumps over the lazy dogthe quick brown fox jumps over the lazy dogthe quick brown fox jumps over the lazy dog") 253 | dict := s 254 | d.Dict = dict 255 | co.SetDictionary(d) 256 | s = []byte("THE QUICK BROWN FOX JUMPS OVER THE LAZY DOGTHE QUICK BROWN FOX JUMPS OVER THE LAZY DOG THE QUICK BROWN FOX JUMPS OVER THE LAZY DOG") 257 | co.Write(s) 258 | assert(t, co.Ratio() == 0) 259 | co.Close() 260 | assert(t, b.Len() > 0) 261 | assert(t, b.Len() == 132) 262 | assert(t, b.Bytes()[0] == 0x82) 263 | assert(t, b.Bytes()[1] == 0x01) 264 | assert(t, bytes.Compare(b.Bytes()[2:], []byte(s)) == 0) 265 | assert(t, co.Ratio() > 0) 266 | assert(t, co.Ratio() == (10000*b.Len()/len(s))) 267 | assert(t, b.Len() > 0) 268 | ex := NewExpander(b, dict) 269 | assert(t, ex != nil) 270 | out := make([]byte, 0) 271 | o, err := ex.Expand(out) 272 | assert(t, err == nil) 273 | assert(t, len(o) == len(s)) 274 | assert(t, bytes.Compare(o, s) == 0) 275 | } 276 | 277 | func TestSerializeDeserialize(t *testing.T) { 278 | b := new(bytes.Buffer) 279 | co := NewCompressor() 280 | co.SetWriter(b) 281 | d := new(Dictionary) 282 | assert(t, b != nil) 283 | assert(t, co != nil) 284 | assert(t, d != nil) 285 | s := []byte("the quick brown fox jumps over 
the lazy dogthe quick brown fox jumps over the lazy dogthe quick brown fox jumps over the lazy dog") 286 | dict := s 287 | d.Dict = dict 288 | co.SetDictionary(d) 289 | s = []byte("THE QUICK BROWN FOX JUMPS OVER THE LAZY DOGTHE QUICK BROWN FOX JUMPS OVER THE LAZY DOG THE QUICK BROWN FOX JUMPS OVER THE LAZY DOG") 290 | co.Write(s) 291 | assert(t, co.Ratio() == 0) 292 | co.Close() 293 | 294 | serialized, err := co.SerializeDictionary() 295 | assert(t, len(serialized) != 0) 296 | assert(t, err == nil) 297 | 298 | temp := make(map[uint32]uint32) 299 | for k, v := range co.GetDictionary().H { 300 | temp[k] = v 301 | } 302 | 303 | co.GetDictionary().H = make(map[uint32]uint32) 304 | assert(t, len(co.GetDictionary().H) == 0) 305 | 306 | m := make(map[uint32]uint32) 307 | err = DeserializeDictionary(serialized, m) 308 | assert(t, err == nil) 309 | 310 | assert(t, len(temp) != 0) 311 | assert(t, len(temp) == len(m)) 312 | 313 | for k, v := range m { 314 | assert(t, temp[k] == v) 315 | } 316 | 317 | d.Dict = dict 318 | d.H = m 319 | co.SetDictionary(d) 320 | 321 | assert(t, len(temp) == len(co.GetDictionary().H)) 322 | 323 | for k, v := range co.GetDictionary().H { 324 | assert(t, temp[k] == v) 325 | } 326 | } 327 | -------------------------------------------------------------------------------- /bm.go: -------------------------------------------------------------------------------- 1 | // bm.go: an implementation of the Bentley/McIlroy compression 2 | // technique. See "Data Compression Using Long Common Strings", Jon 3 | // Bentley, Douglas McIlroy, Proceedings of the IEEE Data Compression 4 | // Conference, 1999, pp. 287-295. 5 | // 6 | // Copyright (c) 2012-2013 CloudFlare, Inc. 7 | 8 | package bm 9 | 10 | import ( 11 | "bytes" 12 | "encoding/binary" 13 | "errors" 14 | "io" 15 | ) 16 | 17 | // The compressor uses the Rabin/Karp algorithm to create fingerprints 18 | // of sub-strings of the text to be compressed. 
19 | 20 | // Details of Rabin/Karp are in "Efficient randomized pattern-matching 21 | // algorithms", Karp, R. M., and Rabin, M. O., IBM Journal of Research 22 | // and Development 31, 2 March 1989, pp. 249-260. 23 | // 24 | // We work over bytes (rather than characters) and Rabin/Karp using 25 | // arithmetic in base d in the ring Zp where p is prime and d is the 26 | // nearest prime to the size of the input alphabet (since bytes d = 27 | // 257). For optimal storage we choose p such that pd fits as best as 28 | // it can into 32 bits. i.e. p is the prime nearest to 29 | // 2^32/d. Various tricks are performed below to speed the computation 30 | // of the hash. Notably p is not actually prime, it's a power of 2 so 31 | // that & is used instead of %. 32 | // 33 | // Fingerprints are generated over a fixed block size which is defined 34 | // here. This is very open to experimentation and could actually be 35 | // a parameter. 36 | // 37 | // To make this as fast as possible the actual Rabin/Karp algorithm is 38 | // not used, but values are picked to be powers of 2 so that slow 39 | // operations can be made very fast. 40 | 41 | const block uint32 = 50 42 | const radix uint32 = (1 << 8) + 1 43 | const prime uint32 = 1 << (32 - 8 - 1) 44 | const clip uint32 = prime - 1 // Used to emulate a % operation when we 45 | // know that prime is a power of two 46 | 47 | // A Dictionary contains both the raw data being compressed against 48 | // and the hash table built using the Rabin/Karp procedure 49 | type Dictionary struct { 50 | Dict []byte // Bytes to compress against 51 | H map[uint32]uint32 52 | // Stores the mapping between block checksums and their positions 53 | } 54 | 55 | // A Compressor is a complete instance of the compressor 56 | type Compressor struct { 57 | w io.Writer // The io.Writer where compressed data will be written 58 | f uint32 // The current fingerprint as we are processing 59 | d []byte // The data to be compressed. 
60 | l uint32 // Largest 'digit' in the radix that will be seen in the 61 | // fingerprint 62 | save [256]uint32 63 | dict Dictionary 64 | 65 | // Values that keep track of the size of the data that was written 66 | // and the compressed output size 67 | 68 | inSize int 69 | outSize int 70 | } 71 | 72 | // NewCompressor creates a new compressor. The Compressor implements 73 | // io.Writer and so calling Write() compress and writes to the actual 74 | // output. Note that you must call SetWriter and SetDictionary before 75 | // doing any compression to set the output writer. 76 | func NewCompressor() *Compressor { 77 | c := Compressor{} 78 | c.w = nil 79 | c.f = 0 80 | 81 | // Calculate the largest 'digit' that can be stored in the 82 | // fingerprint. It's radix^(block-1) mod prime. Calculated in a 83 | // loop to avoid an overflow when doing something like 256^100 mod 84 | // 16777213. 85 | 86 | c.l = 1 87 | var i uint32 88 | for i = 0; i < block-1; i++ { 89 | c.l *= radix 90 | c.l &= clip 91 | } 92 | 93 | for i = 0; i < 256; i++ { 94 | c.save[i] = i * c.l 95 | } 96 | 97 | c.inSize = 0 98 | c.outSize = 0 99 | 100 | return &c 101 | } 102 | 103 | // SetWriter sets the writer to which the compressed output will be written. 104 | // This must be called otherwise an error will occur. 105 | func (c *Compressor) SetWriter(w io.Writer) { 106 | c.w = w 107 | } 108 | 109 | // SetDictionary sets a dictionary. 
When a dictionary has been loaded 110 | // references are made to the dictionary (rather than internally in 111 | // the compressed data itself) 112 | func (c *Compressor) SetDictionary(dict *Dictionary) { 113 | c.dict.Dict = dict.Dict 114 | 115 | // If the dictionary of hashes has not been computed then it must 116 | // be computed now 117 | if dict.H == nil { 118 | c.dict.H = make(map[uint32]uint32) 119 | 120 | f := uint32(0) 121 | for ii := range c.dict.Dict { 122 | i := uint32(ii) 123 | 124 | if i < block { 125 | f = (f*radix + uint32(c.dict.Dict[i])) & clip 126 | } else { 127 | if i%block == 0 { 128 | _, exists := c.dict.H[f] 129 | if !exists { 130 | c.dict.H[f] = uint32(i - block) 131 | } 132 | } 133 | 134 | f = (radix*(f-c.save[c.dict.Dict[i-block]]) + 135 | uint32(c.dict.Dict[i])) & clip 136 | } 137 | } 138 | } else { 139 | c.dict.H = dict.H 140 | } 141 | } 142 | 143 | // GetDictionary retrieves the dictionary structure for serialization 144 | func (c *Compressor) GetDictionary() *Dictionary { 145 | return &c.dict 146 | } 147 | 148 | // Write implements the io.Writer interface. To compress data Write 149 | // repeatedly and it will be compressed. When done it is necessary to 150 | // call Close() where the actual compression occurs. 151 | func (c *Compressor) Write(p []byte) (int, error) { 152 | c.d = append(c.d, p...) 153 | n := len(p) 154 | c.inSize += n 155 | return n, nil 156 | } 157 | 158 | // File format: 159 | // 160 | // A section of uncompressed data is written with a length value 161 | // preceding it. The length is a base 128 number (a varint) with the 162 | // same encoding Google Protocol Buffers uses. 163 | // 164 | // A compression section starts with a 0 (since no uncompressed 165 | // section can have zero length) followed by a pair of varints giving 166 | // the offset and length of the region to be copied. 167 | 168 | // writeVarUint: writes out a variable integer which uses base 128 169 | // in the style of Google Protocol Buffers. 
170 | func (c *Compressor) writeVarUint(u uint32) error { 171 | buf := make([]byte, 1) 172 | 173 | for { 174 | buf[0] = byte(u & 0x7F) 175 | u >>= 7 176 | 177 | if u != 0 { 178 | buf[0] |= 0x80 179 | } 180 | n, err := c.w.Write(buf) 181 | if err != nil { 182 | return err 183 | } 184 | c.outSize += n 185 | if u == 0 { 186 | break 187 | } 188 | } 189 | return nil 190 | } 191 | 192 | // writeUncompressedBlock: writes out a block of uncompressed data 193 | // preceded by the length of the block as a variable length integer 194 | func (c *Compressor) writeUncompressedBlock(d []byte) error { 195 | if len(d) == 0 { 196 | return nil 197 | } 198 | if err := c.writeVarUint(uint32(len(d))); err != nil { 199 | return err 200 | } 201 | if n, err := c.w.Write(d); err != nil { 202 | return err 203 | } else { 204 | c.outSize += n 205 | } 206 | return nil 207 | } 208 | 209 | // writeCompressedReference: writes out a block of compressed data 210 | // which simply consists of a reference to the start of a block to 211 | // copy and its length. This is preceded by zero to indicate that 212 | // this is a block of compressed data 213 | func (c *Compressor) writeCompressedReference(start, offset uint32) error { 214 | zero := []byte{0} 215 | if n, err := c.w.Write(zero); err != nil { 216 | return err 217 | } else { 218 | c.outSize += n 219 | } 220 | if err := c.writeVarUint(start); err != nil { 221 | return err 222 | } 223 | 224 | return c.writeVarUint(offset) 225 | 226 | } 227 | 228 | // Close tells the compressor that all the data has been written. 229 | // This does not close the underlying io.Writer. This is where the 230 | // Bentley/McIlroy and Rabin/Karp algorithms are implemented. 231 | // Reference those papers for a full explanation. 232 | func (c *Compressor) Close() error { 233 | var skip uint32 234 | var last uint32 235 | 236 | // This points to the slice containing the buffer used as the 237 | // dictionary for the compression. 
This is either the data itself 238 | // (for self referential compression) or its the dictionary set by 239 | // SetDictionary 240 | 241 | for x := range c.d { 242 | i := uint32(x) 243 | 244 | // The first block bytes are consumed to calculate the 245 | // fingerprint of the first block 246 | 247 | if i < block { 248 | c.f = (c.f*radix + uint32(c.d[i])) & clip 249 | } else { 250 | 251 | // The data is broken up into non-overlapping blocks of 252 | // size block. The fingerprints of each block are stored 253 | // in a hash table which keeps the position the first time 254 | // the block was seen. 255 | // 256 | // The fingerprint of the current block which covers the 257 | // block bytes (i-block,i] is calculated efficiently 258 | // 259 | // The canonical calculation is as follows: 260 | // 261 | // c.f = ( radix * ( c.f - c.d[i-block] * c.l ) + c.d[i] ) % prime 262 | // 263 | // But a number of tricks are performed to make this 264 | // faster. First, values of c.d[i-block] * c.l are kept in 265 | // an array so they are only calculated once. Second, the 266 | // modulo calculation is done using bit twiddling rather 267 | // than division. 268 | 269 | if i >= skip { 270 | 271 | // If the new fingerprint appears in the hash table 272 | // then this block has been seen before and can be 273 | // encoded rather than emitted. 
First must check that 274 | // it's an actual match since there is a small 275 | // probability of the hashing algorithm used for 276 | // calculating fingerprints having a collision 277 | 278 | e, exists := c.dict.H[c.f] 279 | match := false 280 | if exists { 281 | match = true 282 | var j uint32 283 | for j = 0; j < block; j++ { 284 | if c.dict.Dict[e+j] != c.d[i-block+j] { 285 | match = false 286 | break 287 | } 288 | } 289 | } 290 | 291 | // If there's a match then we need to figure out how 292 | // far we can extend it backwards up to block-1 bytes 293 | // and forward as far as possible 294 | 295 | if match { 296 | var s uint32 297 | for s = 1; s < block; s++ { 298 | if i < last+block+s { 299 | break 300 | } 301 | 302 | if e < s { 303 | break 304 | } 305 | 306 | if i < block+s { 307 | break 308 | } 309 | 310 | if c.dict.Dict[e-s] != c.d[i-block-s] { 311 | break 312 | } 313 | } 314 | s-- 315 | 316 | var f uint32 317 | for f = 0; f < uint32(len(c.d))-i; f++ { 318 | if e+block+f >= uint32(len(c.dict.Dict)) { 319 | break 320 | } 321 | 322 | if c.dict.Dict[e+block+f] != c.d[i+f] { 323 | break 324 | } 325 | } 326 | 327 | if err := c.writeUncompressedBlock(c.d[last : i-block-s]); err != nil { 328 | return err 329 | } 330 | if err := c.writeCompressedReference(e-s, block+s+f); err != nil { 331 | return err 332 | } 333 | skip = i + f + block + 1 334 | last = i + f 335 | } 336 | 337 | } 338 | 339 | c.f = ((c.f-c.save[c.d[i-block]])*radix + uint32(c.d[i])) & clip 340 | } 341 | } 342 | 343 | if last < uint32(len(c.d)) { 344 | return c.writeUncompressedBlock(c.d[last:]) 345 | } 346 | 347 | return nil 348 | } 349 | 350 | // Ratio retrieves the compression ratio of the last compression 351 | // performed. Only makes sense after Close() has been called. The 352 | // returned value is an integer representing the size of the output as 353 | // a percentage of the input * 100. If the return value is -1 then it 354 | // indicates that there was no input. 
355 | func (c *Compressor) Ratio() int { 356 | if c.inSize > 0 { 357 | return (10000 * c.outSize) / c.inSize 358 | } 359 | return -1 360 | } 361 | 362 | // CompressedSize returns the size in bytes of the last compressed 363 | // output. Only makes sense after Close() has been called. 364 | func (c *Compressor) CompressedSize() int { 365 | return c.outSize 366 | } 367 | 368 | // InputSize returns the size in bytes of the last uncompressed 369 | // input. Only makes sense after Close() has been called. 370 | func (c *Compressor) InputSize() int { 371 | return c.inSize 372 | } 373 | 374 | // SerializeDictionary turns H (the map part of the Dictionary) into a 375 | // []byte for easy storage in memcached or elsewhere. 376 | func (c *Compressor) SerializeDictionary() ([]byte, error) { 377 | if len(c.dict.H) > 0 { 378 | 379 | // This reserves enough space in buf to store the entire map 380 | // with the worst case encoding. The worst case encoding is 381 | // likely to be used for the map keys (because they are 382 | // hashes) but not for the offset values which will in general 383 | // be small. 
384 | 385 | buf := bytes.NewBuffer(make([]byte, 0, 386 | len(c.dict.H)*2*binary.MaxVarintLen32)) 387 | 388 | for k, v := range c.dict.H { 389 | if err := binary.Write(buf, binary.LittleEndian, k); err != nil { 390 | return nil, err 391 | } 392 | if err := binary.Write(buf, binary.LittleEndian, v); err != nil { 393 | return nil, err 394 | } 395 | } 396 | 397 | return buf.Bytes(), nil 398 | } 399 | 400 | return []byte{}, nil 401 | } 402 | 403 | // DeserializeDictionary reads the H part of the Dictionary from a 404 | // []byte previously created with SerializeDictionary 405 | func DeserializeDictionary(o []byte, m map[uint32]uint32) error { 406 | buf := bytes.NewBuffer(o) 407 | 408 | for buf.Len() > 0 { 409 | var k uint32 410 | 411 | if err := binary.Read(buf, binary.LittleEndian, &k); err != nil { 412 | return err 413 | } 414 | var v uint32 415 | if err := binary.Read(buf, binary.LittleEndian, &v); err != nil { 416 | return err 417 | } 418 | m[k] = v 419 | } 420 | 421 | return nil 422 | } 423 | 424 | // An Expander is the complete state of the expander returned by NewExpander 425 | type Expander struct { 426 | r io.Reader // The io.Reader from which the raw compressed data 427 | // is read 428 | d []byte // Data read from r is stored here so that 429 | // compressed back references can be handled 430 | to int // Position in d to which the caller has read 431 | dict []byte // Dictionary to decompress against, if set then 432 | // decompression is done referencing this. If not 433 | // then references are internal. 434 | } 435 | 436 | // NewExpander creates a new decompressor. Pass in an io.Reader that 437 | // can be used to read the raw compressed data. Call the Expand 438 | // method to decompress the data; note that Expander does not 439 | // implement io.Reader. 
440 | func NewExpander(r io.Reader, dict []byte) *Expander { 441 | e := Expander{} 442 | e.r = r 443 | e.to = 0 444 | e.dict = dict 445 | return &e 446 | } 447 | 448 | // readVarUint: since the compressed data consists of varints (see 449 | // bmcompress.go) for details then the fundamental operation is 450 | // reading varints 451 | func (e *Expander) readVarUint() (uint, error) { 452 | u := uint(0) 453 | b := make([]byte, 1) 454 | m := uint(1) 455 | for { 456 | if n, err := e.r.Read(b); n != 1 || err != nil { 457 | return 0, err 458 | } 459 | 460 | u += m * uint(b[0]&byte(0x7F)) 461 | m *= 128 462 | 463 | if b[0] < 128 { 464 | break 465 | } 466 | } 467 | 468 | return u, nil 469 | } 470 | 471 | // Expand expands the compressed data into a buffer 472 | func (e *Expander) Expand(p []byte) (q []byte, err error) { 473 | 474 | // This is done to capture the extreme case that an out of 475 | // bounds error occurs in the expansion. This should never 476 | // happen, but this protects against a corrupt compressed 477 | // block. If this occurs then we return no data at all to 478 | // prevent any bad data being returned to the client. 479 | 480 | defer func() { 481 | if x := recover(); x != nil { 482 | err = errors.New("panic caught inside expander") 483 | p = p[:0] 484 | } 485 | }() 486 | 487 | q = p 488 | A: 489 | for { 490 | var u uint 491 | if u, err = e.readVarUint(); err != nil { 492 | break A 493 | } 494 | 495 | // If the value read is zero then it indicates a compressed 496 | // section which is formed of two varints indicating the 497 | // offset and length, if not then it's an uncompressed section 498 | 499 | if u == 0 { 500 | var offset uint 501 | if offset, err = e.readVarUint(); err != nil { 502 | break A 503 | } 504 | 505 | var length uint 506 | if length, err = e.readVarUint(); err != nil { 507 | break A 508 | } 509 | 510 | q = append(q, e.dict[offset:offset+length]...) 
511 | } else { 512 | left := u 513 | for left > 0 { 514 | buf := make([]byte, left) 515 | var n int 516 | if n, err = e.r.Read(buf); err != nil { 517 | break A 518 | } 519 | if n > 0 { 520 | left -= uint(n) 521 | q = append(q, buf[0:n]...) 522 | } else { 523 | break A 524 | } 525 | } 526 | } 527 | } 528 | 529 | if err == io.EOF { 530 | err = nil 531 | } 532 | 533 | return 534 | } 535 | --------------------------------------------------------------------------------