-
Notifications
You must be signed in to change notification settings - Fork 261
Fix DiffID computation to use uncompressed layer digest #587
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
fc269d0
caaab25
e673719
7dfaef6
55a16d7
50b1194
13e26ba
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -14,10 +14,12 @@ | |
| // limitations under the License. | ||
| //===----------------------------------------------------------------------===// | ||
|
|
||
| import Compression | ||
| import ContainerizationError | ||
| import Crypto | ||
| import Foundation | ||
| import NIOCore | ||
| import zlib | ||
|
|
||
| /// Provides a context to write data into a directory. | ||
| public class ContentWriter { | ||
|
|
@@ -60,6 +62,135 @@ public class ContentWriter { | |
| return try self.write(data) | ||
| } | ||
|
|
||
/// Computes the DiffID for a gzip-compressed layer file.
///
/// Per the OCI Image Specification, a DiffID is the SHA256 digest of the
/// *uncompressed* layer content. The file at `url` is read fully into
/// memory, gunzipped (including gzip trailer validation), and the result
/// is hashed.
///
/// - Parameter url: The URL of the gzip-compressed file.
/// - Returns: The SHA256 digest of the uncompressed content.
/// - Throws: `ContentWriterError` when the data is not valid gzip, or any
///   error raised while reading the file.
public static func diffID(of url: URL) throws -> SHA256.Digest {
    let gzipBytes = try Data(contentsOf: url)
    let layerBytes = try Self.decompressGzip(gzipBytes)
    return SHA256.hash(data: layerBytes)
}
|
|
||
/// Decompresses a single-member gzip archive.
///
/// Strips the gzip header, feeds the raw deflate stream to Apple's
/// Compression framework in 64 KB output chunks, and validates the 8-byte
/// gzip trailer (CRC32 + ISIZE, both little-endian per RFC 1952) against
/// the decompressed output.
///
/// Note: only a single gzip member is supported — the trailer is read from
/// the end of `data`, so concatenated multi-member archives will fail the
/// trailer check.
///
/// - Parameter data: The complete gzip file contents.
/// - Returns: The decompressed bytes.
/// - Throws: `ContentWriterError.invalidGzip` for a malformed header,
///   `.decompressionFailed` when the deflate stream cannot be decoded, and
///   `.gzipTrailerMismatch` when the trailer does not match the output.
private static func decompressGzip(_ data: Data) throws -> Data {
    let headerSize = try Self.gzipHeaderSize(data)

    var output = Data()
    let bufferSize = 65_536
    let destinationBuffer = UnsafeMutablePointer<UInt8>.allocate(capacity: bufferSize)
    defer { destinationBuffer.deallocate() }

    try data.withUnsafeBytes { rawBuffer in
        guard let sourcePointer = rawBuffer.baseAddress?.assumingMemoryBound(to: UInt8.self) else {
            throw ContentWriterError.decompressionFailed
        }

        let stream = UnsafeMutablePointer<compression_stream>.allocate(capacity: 1)
        defer { stream.deallocate() }

        var status = compression_stream_init(stream, COMPRESSION_STREAM_DECODE, COMPRESSION_ZLIB)
        guard status != COMPRESSION_STATUS_ERROR else {
            throw ContentWriterError.decompressionFailed
        }
        defer { compression_stream_destroy(stream) }

        stream.pointee.src_ptr = sourcePointer.advanced(by: headerSize)
        stream.pointee.src_size = data.count - headerSize
        stream.pointee.dst_ptr = destinationBuffer
        stream.pointee.dst_size = bufferSize

        // All input is supplied up front, so pass FINALIZE. With a flag of 0
        // (as before), a truncated deflate stream makes the decoder return
        // COMPRESSION_STATUS_OK forever while waiting for more input, and
        // this loop would spin indefinitely.
        let flags = Int32(COMPRESSION_STREAM_FINALIZE.rawValue)

        repeat {
            status = compression_stream_process(stream, flags)

            switch status {
            case COMPRESSION_STATUS_OK:
                let produced = bufferSize - stream.pointee.dst_size
                // OK should only mean "destination buffer full, more output
                // pending". OK with zero progress means the decoder is stuck
                // on corrupt/truncated input — bail out instead of looping.
                guard produced > 0 else {
                    throw ContentWriterError.decompressionFailed
                }
                output.append(destinationBuffer, count: produced)
                stream.pointee.dst_ptr = destinationBuffer
                stream.pointee.dst_size = bufferSize

            case COMPRESSION_STATUS_END:
                let produced = bufferSize - stream.pointee.dst_size
                if produced > 0 {
                    output.append(destinationBuffer, count: produced)
                }

            default:
                throw ContentWriterError.decompressionFailed
            }
        } while status == COMPRESSION_STATUS_OK
    }

    // Validate the gzip trailer: last 8 bytes are CRC32 + ISIZE (both little-endian).
    guard data.count >= 8 else {
        throw ContentWriterError.gzipTrailerMismatch
    }
    let trailerStart = data.startIndex + data.count - 8
    let expectedCRC = UInt32(data[trailerStart])
        | (UInt32(data[trailerStart + 1]) << 8)
        | (UInt32(data[trailerStart + 2]) << 16)
        | (UInt32(data[trailerStart + 3]) << 24)
    let expectedSize = UInt32(data[trailerStart + 4])
        | (UInt32(data[trailerStart + 5]) << 8)
        | (UInt32(data[trailerStart + 6]) << 16)
        | (UInt32(data[trailerStart + 7]) << 24)

    // CRC32 over the decompressed output via zlib. An empty `Data` may have
    // a nil baseAddress — the previous force-unwrap crashed on empty output;
    // crc32(0, nil, 0) is the defined "initial CRC" per zlib.
    let actualCRC = output.withUnsafeBytes { buffer -> UInt32 in
        guard let base = buffer.baseAddress else {
            return UInt32(crc32(0, nil, 0))
        }
        return UInt32(crc32(0, base.assumingMemoryBound(to: Bytef.self), uInt(buffer.count)))
    }
    // ISIZE is the uncompressed size modulo 2^32 (RFC 1952).
    let actualSize = UInt32(truncatingIfNeeded: output.count)

    guard expectedCRC == actualCRC, expectedSize == actualSize else {
        throw ContentWriterError.gzipTrailerMismatch
    }

    return output
}
|
|
||
/// Parses the gzip header (RFC 1952) and returns the byte offset at which
/// the raw deflate stream begins.
///
/// Validates the magic bytes (`0x1f 0x8b`) and the compression method
/// (CM must be 8 — deflate — the only method RFC 1952 defines), then skips
/// the optional fields indicated by FLG in their mandated order: FEXTRA,
/// FNAME, FCOMMENT, FHCRC. Every skip is bounds-checked so a header whose
/// flags declare fields the file does not actually contain is rejected
/// immediately rather than being caught only by the final length guard.
///
/// - Parameter data: The complete gzip file contents.
/// - Returns: The offset (relative to `data.startIndex`) of the deflate stream.
/// - Throws: `ContentWriterError.invalidGzip` when the header is malformed
///   or truncated.
private static func gzipHeaderSize(_ data: Data) throws -> Int {
    let start = data.startIndex
    guard data.count >= 10,
        data[start] == 0x1f,
        data[start + 1] == 0x8b,
        data[start + 2] == 0x08  // CM must be 8 (deflate) per RFC 1952
    else {
        throw ContentWriterError.invalidGzip
    }

    let flags = data[start + 3]
    // Fixed header is 10 bytes: magic (2), CM, FLG, MTIME (4), XFL, OS.
    var offset = 10

    // FEXTRA: 2-byte little-endian XLEN followed by XLEN bytes of extra data.
    if flags & 0x04 != 0 {
        guard data.count >= offset + 2 else { throw ContentWriterError.invalidGzip }
        let extraLen = Int(data[start + offset]) | (Int(data[start + offset + 1]) << 8)
        offset += 2
        guard data.count >= offset + extraLen else { throw ContentWriterError.invalidGzip }
        offset += extraLen
    }
    // FNAME: zero-terminated original file name. The NUL must be present.
    if flags & 0x08 != 0 {
        while offset < data.count && data[start + offset] != 0 { offset += 1 }
        guard offset < data.count else { throw ContentWriterError.invalidGzip }
        offset += 1
    }
    // FCOMMENT: zero-terminated comment. The NUL must be present.
    if flags & 0x10 != 0 {
        while offset < data.count && data[start + offset] != 0 { offset += 1 }
        guard offset < data.count else { throw ContentWriterError.invalidGzip }
        offset += 1
    }
    // FHCRC: 2-byte CRC16 of the header (skipped, not verified here).
    if flags & 0x02 != 0 {
        guard data.count >= offset + 2 else { throw ContentWriterError.invalidGzip }
        offset += 2
    }

    // At least one byte of deflate data must follow the header.
    guard offset < data.count else { throw ContentWriterError.invalidGzip }
    return offset
}
|
|
||
| /// Encodes the passed in type as a JSON blob and writes it to the base path. | ||
| /// - Parameters: | ||
| /// - content: The type to convert to JSON. | ||
|
|
@@ -69,3 +200,9 @@ public class ContentWriter { | |
| return try self.write(data) | ||
| } | ||
| } | ||
|
|
||
/// Errors thrown while computing a layer DiffID from a gzip-compressed file.
enum ContentWriterError: Error {
    // The gzip header is malformed: bad magic bytes, unsupported
    // compression method, or truncated optional header fields.
    case invalidGzip
    // The raw deflate stream could not be decoded by the Compression framework.
    case decompressionFailed
    // The gzip trailer's CRC32/ISIZE does not match the decompressed output.
    case gzipTrailerMismatch
}
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,159 @@ | ||
| //===----------------------------------------------------------------------===// | ||
| // Copyright © 2025-2026 Apple Inc. and the Containerization project authors. | ||
| // | ||
| // Licensed under the Apache License, Version 2.0 (the "License"); | ||
| // you may not use this file except in compliance with the License. | ||
| // You may obtain a copy of the License at | ||
| // | ||
| // https://www.apache.org/licenses/LICENSE-2.0 | ||
| // | ||
| // Unless required by applicable law or agreed to in writing, software | ||
| // distributed under the License is distributed on an "AS IS" BASIS, | ||
| // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
| // See the License for the specific language governing permissions and | ||
| // limitations under the License. | ||
| //===----------------------------------------------------------------------===// | ||
|
|
||
| import Crypto | ||
| import Foundation | ||
| import Testing | ||
|
|
||
| @testable import ContainerizationOCI | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Question: I think we may need a test that validates the gzip trailer, to ensure a digest is not returned for malformed data.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Good shout — the implementation wasn't validating the gzip trailer at all. I've added CRC32 + ISIZE verification after decompression (throws gzipTrailerMismatch on failure) and two new tests: one for a truncated archive with the trailer chopped off, and one for a corrupted CRC32 field. |
||
|
|
||
struct DiffIDTests {
    /// Thrown when the external `gzip` tool fails to produce an archive.
    private struct GzipToolFailure: Error {}

    /// Helper to create a gzip-compressed temporary file from raw data.
    ///
    /// Shells out to `/usr/bin/gzip`. Previously a tool failure (nonzero
    /// exit, missing binary output) was silently ignored and a URL to a
    /// nonexistent file was returned, producing confusing downstream test
    /// failures; now the helper fails fast instead.
    private func createGzipFile(content: Data) throws -> URL {
        let tempDir = FileManager.default.temporaryDirectory
        let rawFile = tempDir.appendingPathComponent(UUID().uuidString)
        let gzFile = tempDir.appendingPathComponent(UUID().uuidString + ".gz")
        try content.write(to: rawFile)
        defer { try? FileManager.default.removeItem(at: rawFile) }

        let process = Process()
        process.executableURL = URL(fileURLWithPath: "/usr/bin/gzip")
        process.arguments = ["-k", "-f", rawFile.path]
        try process.run()
        process.waitUntilExit()

        let gzPath = URL(fileURLWithPath: rawFile.path + ".gz")
        guard process.terminationStatus == 0,
            FileManager.default.fileExists(atPath: gzPath.path)
        else {
            throw GzipToolFailure()
        }
        try FileManager.default.moveItem(at: gzPath, to: gzFile)
        return gzFile
    }

    /// The DiffID must equal the SHA256 of the *uncompressed* content.
    @Test func diffIDMatchesUncompressedSHA256() throws {
        let content = Data("hello, oci layer content for diffid test".utf8)
        let gzFile = try createGzipFile(content: content)
        defer { try? FileManager.default.removeItem(at: gzFile) }

        let diffID = try ContentWriter.diffID(of: gzFile)
        let expected = SHA256.hash(data: content)

        #expect(diffID.digestString == expected.digestString)
    }

    /// Hashing the same file twice must yield the same digest.
    @Test func diffIDIsDeterministic() throws {
        let content = Data("deterministic diffid check".utf8)
        let gzFile = try createGzipFile(content: content)
        defer { try? FileManager.default.removeItem(at: gzFile) }

        let first = try ContentWriter.diffID(of: gzFile)
        let second = try ContentWriter.diffID(of: gzFile)

        #expect(first.digestString == second.digestString)
    }

    /// Data without a gzip header must be rejected, not hashed.
    @Test func diffIDRejectsNonGzipData() throws {
        let tempFile = FileManager.default.temporaryDirectory.appendingPathComponent(UUID().uuidString)
        try Data("this is not gzip".utf8).write(to: tempFile)
        defer { try? FileManager.default.removeItem(at: tempFile) }

        #expect(throws: ContentWriterError.self) {
            try ContentWriter.diffID(of: tempFile)
        }
    }

    /// An empty file cannot contain a gzip header and must throw.
    @Test func diffIDRejectsEmptyFile() throws {
        let tempFile = FileManager.default.temporaryDirectory.appendingPathComponent(UUID().uuidString)
        try Data().write(to: tempFile)
        defer { try? FileManager.default.removeItem(at: tempFile) }

        #expect(throws: ContentWriterError.self) {
            try ContentWriter.diffID(of: tempFile)
        }
    }

    /// Content larger than the 64 KB decompression buffer must round-trip
    /// correctly (exercises the multi-chunk output path).
    @Test func diffIDHandlesLargeContent() throws {
        // 1 MiB of repeating data.
        let pattern = Data("ABCDEFGHIJKLMNOPQRSTUVWXYZ012345".utf8)
        var large = Data()
        large.reserveCapacity(1_048_576)
        for _ in 0..<(1_048_576 / pattern.count) {
            large.append(pattern)
        }
        let gzFile = try createGzipFile(content: large)
        defer { try? FileManager.default.removeItem(at: gzFile) }

        let diffID = try ContentWriter.diffID(of: gzFile)
        let expected = SHA256.hash(data: large)

        #expect(diffID.digestString == expected.digestString)
    }

    /// A structurally malformed archive must not yield a digest.
    @Test func diffIDRejectsTruncatedGzip() throws {
        // Build a valid gzip file, then chop off the 8-byte trailer
        // (CRC32 + ISIZE) to produce a structurally malformed archive.
        let content = Data("truncated gzip trailer test".utf8)
        let gzFile = try createGzipFile(content: content)
        defer { try? FileManager.default.removeItem(at: gzFile) }

        var gzData = try Data(contentsOf: gzFile)
        guard gzData.count > 8 else {
            Issue.record("Compressed file too small to truncate")
            return
        }
        gzData.removeLast(8)

        let truncatedFile = FileManager.default.temporaryDirectory
            .appendingPathComponent(UUID().uuidString + ".gz")
        try gzData.write(to: truncatedFile)
        defer { try? FileManager.default.removeItem(at: truncatedFile) }

        #expect(throws: ContentWriterError.self) {
            try ContentWriter.diffID(of: truncatedFile)
        }
    }

    /// Trailer CRC validation must catch a corrupted CRC32 field.
    @Test func diffIDRejectsCorruptedCRC() throws {
        // Flip a byte in the CRC32 field of an otherwise valid gzip file.
        let content = Data("corrupted crc test".utf8)
        let gzFile = try createGzipFile(content: content)
        defer { try? FileManager.default.removeItem(at: gzFile) }

        var gzData = try Data(contentsOf: gzFile)
        let crcOffset = gzData.count - 8
        gzData[crcOffset] ^= 0xFF

        let corruptedFile = FileManager.default.temporaryDirectory
            .appendingPathComponent(UUID().uuidString + ".gz")
        try gzData.write(to: corruptedFile)
        defer { try? FileManager.default.removeItem(at: corruptedFile) }

        #expect(throws: ContentWriterError.self) {
            try ContentWriter.diffID(of: corruptedFile)
        }
    }

    /// Digest strings follow the OCI "sha256:<64 hex chars>" format.
    @Test func diffIDDigestStringFormat() throws {
        let content = Data("format test".utf8)
        let gzFile = try createGzipFile(content: content)
        defer { try? FileManager.default.removeItem(at: gzFile) }

        let diffID = try ContentWriter.diffID(of: gzFile)
        let digestString = diffID.digestString

        #expect(digestString.hasPrefix("sha256:"))
        // "sha256:" prefix (7 chars) + 64 hex characters.
        #expect(digestString.count == 7 + 64)
    }
}
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Question: If I'm reading this right
`Data(contentsOf: url)` will load the entire compressed file into memory and then feed it into `decompressGzip`, which is not the same as "using a streaming approach for memory efficiency" per the comment? There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Fair point — the docstring was misleading. The decompression itself does use a streaming buffer internally (64 KB chunks through compression_stream_process), but the initial load and the final hash are not streamed. I've corrected the comment to say what it actually does.