snappy.go 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434
  1. // Copyright 2019+ Klaus Post. All rights reserved.
  2. // License information can be found in the LICENSE file.
  3. // Based on work by Yann Collet, released under BSD License.
  4. package zstd
  5. import (
  6. "encoding/binary"
  7. "errors"
  8. "hash/crc32"
  9. "io"
  10. "github.com/klauspost/compress/huff0"
  11. snappy "github.com/klauspost/compress/internal/snapref"
  12. )
  13. const (
  14. snappyTagLiteral = 0x00
  15. snappyTagCopy1 = 0x01
  16. snappyTagCopy2 = 0x02
  17. snappyTagCopy4 = 0x03
  18. )
  19. const (
  20. snappyChecksumSize = 4
  21. snappyMagicBody = "sNaPpY"
  22. // snappyMaxBlockSize is the maximum size of the input to encodeBlock. It is not
  23. // part of the wire format per se, but some parts of the encoder assume
  24. // that an offset fits into a uint16.
  25. //
  26. // Also, for the framing format (Writer type instead of Encode function),
  27. // https://github.com/google/snappy/blob/master/framing_format.txt says
  28. // that "the uncompressed data in a chunk must be no longer than 65536
  29. // bytes".
  30. snappyMaxBlockSize = 65536
  31. // snappyMaxEncodedLenOfMaxBlockSize equals MaxEncodedLen(snappyMaxBlockSize), but is
  32. // hard coded to be a const instead of a variable, so that obufLen can also
  33. // be a const. Their equivalence is confirmed by
  34. // TestMaxEncodedLenOfMaxBlockSize.
  35. snappyMaxEncodedLenOfMaxBlockSize = 76490
  36. )
  37. const (
  38. chunkTypeCompressedData = 0x00
  39. chunkTypeUncompressedData = 0x01
  40. chunkTypePadding = 0xfe
  41. chunkTypeStreamIdentifier = 0xff
  42. )
  43. var (
  44. // ErrSnappyCorrupt reports that the input is invalid.
  45. ErrSnappyCorrupt = errors.New("snappy: corrupt input")
  46. // ErrSnappyTooLarge reports that the uncompressed length is too large.
  47. ErrSnappyTooLarge = errors.New("snappy: decoded block is too large")
  48. // ErrSnappyUnsupported reports that the input isn't supported.
  49. ErrSnappyUnsupported = errors.New("snappy: unsupported input")
  50. errUnsupportedLiteralLength = errors.New("snappy: unsupported literal length")
  51. )
  52. // SnappyConverter can read SnappyConverter-compressed streams and convert them to zstd.
  53. // Conversion is done by converting the stream directly from Snappy without intermediate
  54. // full decoding.
  55. // Therefore the compression ratio is much less than what can be done by a full decompression
  56. // and compression, and a faulty Snappy stream may lead to a faulty Zstandard stream without
  57. // any errors being generated.
  58. // No CRC value is being generated and not all CRC values of the Snappy stream are checked.
  59. // However, it provides really fast recompression of Snappy streams.
  60. // The converter can be reused to avoid allocations, even after errors.
  61. type SnappyConverter struct {
  62. r io.Reader
  63. err error
  64. buf []byte
  65. block *blockEnc
  66. }
  67. // Convert the Snappy stream supplied in 'in' and write the zStandard stream to 'w'.
  68. // If any error is detected on the Snappy stream it is returned.
  69. // The number of bytes written is returned.
  70. func (r *SnappyConverter) Convert(in io.Reader, w io.Writer) (int64, error) {
  71. initPredefined()
  72. r.err = nil
  73. r.r = in
  74. if r.block == nil {
  75. r.block = &blockEnc{}
  76. r.block.init()
  77. }
  78. r.block.initNewEncode()
  79. if len(r.buf) != snappyMaxEncodedLenOfMaxBlockSize+snappyChecksumSize {
  80. r.buf = make([]byte, snappyMaxEncodedLenOfMaxBlockSize+snappyChecksumSize)
  81. }
  82. r.block.litEnc.Reuse = huff0.ReusePolicyNone
  83. var written int64
  84. var readHeader bool
  85. {
  86. header := frameHeader{WindowSize: snappyMaxBlockSize}.appendTo(r.buf[:0])
  87. var n int
  88. n, r.err = w.Write(header)
  89. if r.err != nil {
  90. return written, r.err
  91. }
  92. written += int64(n)
  93. }
  94. for {
  95. if !r.readFull(r.buf[:4], true) {
  96. // Add empty last block
  97. r.block.reset(nil)
  98. r.block.last = true
  99. err := r.block.encodeLits(r.block.literals, false)
  100. if err != nil {
  101. return written, err
  102. }
  103. n, err := w.Write(r.block.output)
  104. if err != nil {
  105. return written, err
  106. }
  107. written += int64(n)
  108. return written, r.err
  109. }
  110. chunkType := r.buf[0]
  111. if !readHeader {
  112. if chunkType != chunkTypeStreamIdentifier {
  113. println("chunkType != chunkTypeStreamIdentifier", chunkType)
  114. r.err = ErrSnappyCorrupt
  115. return written, r.err
  116. }
  117. readHeader = true
  118. }
  119. chunkLen := int(r.buf[1]) | int(r.buf[2])<<8 | int(r.buf[3])<<16
  120. if chunkLen > len(r.buf) {
  121. println("chunkLen > len(r.buf)", chunkType)
  122. r.err = ErrSnappyUnsupported
  123. return written, r.err
  124. }
  125. // The chunk types are specified at
  126. // https://github.com/google/snappy/blob/master/framing_format.txt
  127. switch chunkType {
  128. case chunkTypeCompressedData:
  129. // Section 4.2. Compressed data (chunk type 0x00).
  130. if chunkLen < snappyChecksumSize {
  131. println("chunkLen < snappyChecksumSize", chunkLen, snappyChecksumSize)
  132. r.err = ErrSnappyCorrupt
  133. return written, r.err
  134. }
  135. buf := r.buf[:chunkLen]
  136. if !r.readFull(buf, false) {
  137. return written, r.err
  138. }
  139. //checksum := uint32(buf[0]) | uint32(buf[1])<<8 | uint32(buf[2])<<16 | uint32(buf[3])<<24
  140. buf = buf[snappyChecksumSize:]
  141. n, hdr, err := snappyDecodedLen(buf)
  142. if err != nil {
  143. r.err = err
  144. return written, r.err
  145. }
  146. buf = buf[hdr:]
  147. if n > snappyMaxBlockSize {
  148. println("n > snappyMaxBlockSize", n, snappyMaxBlockSize)
  149. r.err = ErrSnappyCorrupt
  150. return written, r.err
  151. }
  152. r.block.reset(nil)
  153. r.block.pushOffsets()
  154. if err := decodeSnappy(r.block, buf); err != nil {
  155. r.err = err
  156. return written, r.err
  157. }
  158. if r.block.size+r.block.extraLits != n {
  159. printf("invalid size, want %d, got %d\n", n, r.block.size+r.block.extraLits)
  160. r.err = ErrSnappyCorrupt
  161. return written, r.err
  162. }
  163. err = r.block.encode(nil, false, false)
  164. switch err {
  165. case errIncompressible:
  166. r.block.popOffsets()
  167. r.block.reset(nil)
  168. r.block.literals, err = snappy.Decode(r.block.literals[:n], r.buf[snappyChecksumSize:chunkLen])
  169. if err != nil {
  170. return written, err
  171. }
  172. err = r.block.encodeLits(r.block.literals, false)
  173. if err != nil {
  174. return written, err
  175. }
  176. case nil:
  177. default:
  178. return written, err
  179. }
  180. n, r.err = w.Write(r.block.output)
  181. if r.err != nil {
  182. return written, err
  183. }
  184. written += int64(n)
  185. continue
  186. case chunkTypeUncompressedData:
  187. if debugEncoder {
  188. println("Uncompressed, chunklen", chunkLen)
  189. }
  190. // Section 4.3. Uncompressed data (chunk type 0x01).
  191. if chunkLen < snappyChecksumSize {
  192. println("chunkLen < snappyChecksumSize", chunkLen, snappyChecksumSize)
  193. r.err = ErrSnappyCorrupt
  194. return written, r.err
  195. }
  196. r.block.reset(nil)
  197. buf := r.buf[:snappyChecksumSize]
  198. if !r.readFull(buf, false) {
  199. return written, r.err
  200. }
  201. checksum := uint32(buf[0]) | uint32(buf[1])<<8 | uint32(buf[2])<<16 | uint32(buf[3])<<24
  202. // Read directly into r.decoded instead of via r.buf.
  203. n := chunkLen - snappyChecksumSize
  204. if n > snappyMaxBlockSize {
  205. println("n > snappyMaxBlockSize", n, snappyMaxBlockSize)
  206. r.err = ErrSnappyCorrupt
  207. return written, r.err
  208. }
  209. r.block.literals = r.block.literals[:n]
  210. if !r.readFull(r.block.literals, false) {
  211. return written, r.err
  212. }
  213. if snappyCRC(r.block.literals) != checksum {
  214. println("literals crc mismatch")
  215. r.err = ErrSnappyCorrupt
  216. return written, r.err
  217. }
  218. err := r.block.encodeLits(r.block.literals, false)
  219. if err != nil {
  220. return written, err
  221. }
  222. n, r.err = w.Write(r.block.output)
  223. if r.err != nil {
  224. return written, err
  225. }
  226. written += int64(n)
  227. continue
  228. case chunkTypeStreamIdentifier:
  229. if debugEncoder {
  230. println("stream id", chunkLen, len(snappyMagicBody))
  231. }
  232. // Section 4.1. Stream identifier (chunk type 0xff).
  233. if chunkLen != len(snappyMagicBody) {
  234. println("chunkLen != len(snappyMagicBody)", chunkLen, len(snappyMagicBody))
  235. r.err = ErrSnappyCorrupt
  236. return written, r.err
  237. }
  238. if !r.readFull(r.buf[:len(snappyMagicBody)], false) {
  239. return written, r.err
  240. }
  241. for i := 0; i < len(snappyMagicBody); i++ {
  242. if r.buf[i] != snappyMagicBody[i] {
  243. println("r.buf[i] != snappyMagicBody[i]", r.buf[i], snappyMagicBody[i], i)
  244. r.err = ErrSnappyCorrupt
  245. return written, r.err
  246. }
  247. }
  248. continue
  249. }
  250. if chunkType <= 0x7f {
  251. // Section 4.5. Reserved unskippable chunks (chunk types 0x02-0x7f).
  252. println("chunkType <= 0x7f")
  253. r.err = ErrSnappyUnsupported
  254. return written, r.err
  255. }
  256. // Section 4.4 Padding (chunk type 0xfe).
  257. // Section 4.6. Reserved skippable chunks (chunk types 0x80-0xfd).
  258. if !r.readFull(r.buf[:chunkLen], false) {
  259. return written, r.err
  260. }
  261. }
  262. }
  263. // decodeSnappy writes the decoding of src to dst. It assumes that the varint-encoded
  264. // length of the decompressed bytes has already been read.
  265. func decodeSnappy(blk *blockEnc, src []byte) error {
  266. //decodeRef(make([]byte, snappyMaxBlockSize), src)
  267. var s, length int
  268. lits := blk.extraLits
  269. var offset uint32
  270. for s < len(src) {
  271. switch src[s] & 0x03 {
  272. case snappyTagLiteral:
  273. x := uint32(src[s] >> 2)
  274. switch {
  275. case x < 60:
  276. s++
  277. case x == 60:
  278. s += 2
  279. if uint(s) > uint(len(src)) { // The uint conversions catch overflow from the previous line.
  280. println("uint(s) > uint(len(src)", s, src)
  281. return ErrSnappyCorrupt
  282. }
  283. x = uint32(src[s-1])
  284. case x == 61:
  285. s += 3
  286. if uint(s) > uint(len(src)) { // The uint conversions catch overflow from the previous line.
  287. println("uint(s) > uint(len(src)", s, src)
  288. return ErrSnappyCorrupt
  289. }
  290. x = uint32(src[s-2]) | uint32(src[s-1])<<8
  291. case x == 62:
  292. s += 4
  293. if uint(s) > uint(len(src)) { // The uint conversions catch overflow from the previous line.
  294. println("uint(s) > uint(len(src)", s, src)
  295. return ErrSnappyCorrupt
  296. }
  297. x = uint32(src[s-3]) | uint32(src[s-2])<<8 | uint32(src[s-1])<<16
  298. case x == 63:
  299. s += 5
  300. if uint(s) > uint(len(src)) { // The uint conversions catch overflow from the previous line.
  301. println("uint(s) > uint(len(src)", s, src)
  302. return ErrSnappyCorrupt
  303. }
  304. x = uint32(src[s-4]) | uint32(src[s-3])<<8 | uint32(src[s-2])<<16 | uint32(src[s-1])<<24
  305. }
  306. if x > snappyMaxBlockSize {
  307. println("x > snappyMaxBlockSize", x, snappyMaxBlockSize)
  308. return ErrSnappyCorrupt
  309. }
  310. length = int(x) + 1
  311. if length <= 0 {
  312. println("length <= 0 ", length)
  313. return errUnsupportedLiteralLength
  314. }
  315. //if length > snappyMaxBlockSize-d || uint32(length) > len(src)-s {
  316. // return ErrSnappyCorrupt
  317. //}
  318. blk.literals = append(blk.literals, src[s:s+length]...)
  319. //println(length, "litLen")
  320. lits += length
  321. s += length
  322. continue
  323. case snappyTagCopy1:
  324. s += 2
  325. if uint(s) > uint(len(src)) { // The uint conversions catch overflow from the previous line.
  326. println("uint(s) > uint(len(src)", s, len(src))
  327. return ErrSnappyCorrupt
  328. }
  329. length = 4 + int(src[s-2])>>2&0x7
  330. offset = uint32(src[s-2])&0xe0<<3 | uint32(src[s-1])
  331. case snappyTagCopy2:
  332. s += 3
  333. if uint(s) > uint(len(src)) { // The uint conversions catch overflow from the previous line.
  334. println("uint(s) > uint(len(src)", s, len(src))
  335. return ErrSnappyCorrupt
  336. }
  337. length = 1 + int(src[s-3])>>2
  338. offset = uint32(src[s-2]) | uint32(src[s-1])<<8
  339. case snappyTagCopy4:
  340. s += 5
  341. if uint(s) > uint(len(src)) { // The uint conversions catch overflow from the previous line.
  342. println("uint(s) > uint(len(src)", s, len(src))
  343. return ErrSnappyCorrupt
  344. }
  345. length = 1 + int(src[s-5])>>2
  346. offset = uint32(src[s-4]) | uint32(src[s-3])<<8 | uint32(src[s-2])<<16 | uint32(src[s-1])<<24
  347. }
  348. if offset <= 0 || blk.size+lits < int(offset) /*|| length > len(blk)-d */ {
  349. println("offset <= 0 || blk.size+lits < int(offset)", offset, blk.size+lits, int(offset), blk.size, lits)
  350. return ErrSnappyCorrupt
  351. }
  352. // Check if offset is one of the recent offsets.
  353. // Adjusts the output offset accordingly.
  354. // Gives a tiny bit of compression, typically around 1%.
  355. if false {
  356. offset = blk.matchOffset(offset, uint32(lits))
  357. } else {
  358. offset += 3
  359. }
  360. blk.sequences = append(blk.sequences, seq{
  361. litLen: uint32(lits),
  362. offset: offset,
  363. matchLen: uint32(length) - zstdMinMatch,
  364. })
  365. blk.size += length + lits
  366. lits = 0
  367. }
  368. blk.extraLits = lits
  369. return nil
  370. }
  371. func (r *SnappyConverter) readFull(p []byte, allowEOF bool) (ok bool) {
  372. if _, r.err = io.ReadFull(r.r, p); r.err != nil {
  373. if r.err == io.ErrUnexpectedEOF || (r.err == io.EOF && !allowEOF) {
  374. r.err = ErrSnappyCorrupt
  375. }
  376. return false
  377. }
  378. return true
  379. }
  380. var crcTable = crc32.MakeTable(crc32.Castagnoli)
  381. // crc implements the checksum specified in section 3 of
  382. // https://github.com/google/snappy/blob/master/framing_format.txt
  383. func snappyCRC(b []byte) uint32 {
  384. c := crc32.Update(0, crcTable, b)
  385. return c>>15 | c<<17 + 0xa282ead8
  386. }
  387. // snappyDecodedLen returns the length of the decoded block and the number of bytes
  388. // that the length header occupied.
  389. func snappyDecodedLen(src []byte) (blockLen, headerLen int, err error) {
  390. v, n := binary.Uvarint(src)
  391. if n <= 0 || v > 0xffffffff {
  392. return 0, 0, ErrSnappyCorrupt
  393. }
  394. const wordSize = 32 << (^uint(0) >> 32 & 1)
  395. if wordSize == 32 && v > 0x7fffffff {
  396. return 0, 0, ErrSnappyTooLarge
  397. }
  398. return int(v), n, nil
  399. }