[pve-devel] [PATCH proxmox v2 01/11] add proxmox-oci crate
Wolfgang Bumiller
w.bumiller at proxmox.com
Tue Jun 24 14:42:50 CEST 2025
On Wed, Jun 11, 2025 at 04:48:53PM +0200, Filip Schauer wrote:
> This crate can parse and extract an OCI image bundled as a tar archive.
>
> Signed-off-by: Filip Schauer <f.schauer at proxmox.com>
> ---
> Cargo.toml | 1 +
> proxmox-oci/Cargo.toml | 22 ++++
> proxmox-oci/debian/changelog | 5 +
> proxmox-oci/debian/control | 47 ++++++++
> proxmox-oci/debian/debcargo.toml | 7 ++
> proxmox-oci/src/lib.rs | 196 +++++++++++++++++++++++++++++++
> proxmox-oci/src/oci_tar_image.rs | 167 ++++++++++++++++++++++++++
> 7 files changed, 445 insertions(+)
> create mode 100644 proxmox-oci/Cargo.toml
> create mode 100644 proxmox-oci/debian/changelog
> create mode 100644 proxmox-oci/debian/control
> create mode 100644 proxmox-oci/debian/debcargo.toml
> create mode 100644 proxmox-oci/src/lib.rs
> create mode 100644 proxmox-oci/src/oci_tar_image.rs
>
> diff --git a/Cargo.toml b/Cargo.toml
> index bf9e83d7..8365b18a 100644
> --- a/Cargo.toml
> +++ b/Cargo.toml
> @@ -26,6 +26,7 @@ members = [
> "proxmox-metrics",
> "proxmox-network-api",
> "proxmox-notify",
> + "proxmox-oci",
> "proxmox-openid",
> "proxmox-product-config",
> "proxmox-rest-server",
> diff --git a/proxmox-oci/Cargo.toml b/proxmox-oci/Cargo.toml
> new file mode 100644
> index 00000000..4daff6ab
> --- /dev/null
> +++ b/proxmox-oci/Cargo.toml
> @@ -0,0 +1,22 @@
> +[package]
> +name = "proxmox-oci"
> +description = "OCI image parsing and extraction"
> +version = "0.1.0"
> +
> +authors.workspace = true
> +edition.workspace = true
> +exclude.workspace = true
> +homepage.workspace = true
> +license.workspace = true
> +repository.workspace = true
> +rust-version.workspace = true
> +
> +[dependencies]
> +flate2.workspace = true
> +oci-spec = "0.8.1"
> +sha2 = "0.10"
> +tar.workspace = true
> +thiserror = "1"
> +zstd.workspace = true
> +
> +proxmox-io.workspace = true
> diff --git a/proxmox-oci/debian/changelog b/proxmox-oci/debian/changelog
> new file mode 100644
> index 00000000..754d06c1
> --- /dev/null
> +++ b/proxmox-oci/debian/changelog
> @@ -0,0 +1,5 @@
> +rust-proxmox-oci (0.1.0-1) bookworm; urgency=medium
> +
> + * Initial release.
> +
> + -- Proxmox Support Team <support at proxmox.com> Mon, 28 Apr 2025 12:34:56 +0200
> diff --git a/proxmox-oci/debian/control b/proxmox-oci/debian/control
> new file mode 100644
> index 00000000..3974cf48
> --- /dev/null
> +++ b/proxmox-oci/debian/control
> @@ -0,0 +1,47 @@
> +Source: rust-proxmox-oci
> +Section: rust
> +Priority: optional
> +Build-Depends: debhelper-compat (= 13),
> + dh-sequence-cargo
> +Build-Depends-Arch: cargo:native <!nocheck>,
> + rustc:native (>= 1.82) <!nocheck>,
> + libstd-rust-dev <!nocheck>,
> + librust-flate2-1+default-dev <!nocheck>,
> + librust-oci-spec-0.8+default-dev (>= 0.8.1-~~) <!nocheck>,
> + librust-proxmox-io-1+default-dev (>= 1.1.0-~~) <!nocheck>,
> + librust-sha2-0.10+default-dev <!nocheck>,
> + librust-tar-0.4+default-dev <!nocheck>,
> + librust-thiserror-1+default-dev <!nocheck>,
> + librust-zstd-0.12+bindgen-dev <!nocheck>,
> + librust-zstd-0.12+default-dev <!nocheck>
> +Maintainer: Proxmox Support Team <support at proxmox.com>
> +Standards-Version: 4.7.0
> +Vcs-Git: git://git.proxmox.com/git/proxmox.git
> +Vcs-Browser: https://git.proxmox.com/?p=proxmox.git
> +Homepage: https://proxmox.com
> +X-Cargo-Crate: proxmox-oci
> +Rules-Requires-Root: no
> +
> +Package: librust-proxmox-oci-dev
> +Architecture: any
> +Multi-Arch: same
> +Depends:
> + ${misc:Depends},
> + librust-flate2-1+default-dev,
> + librust-oci-spec-0.8+default-dev (>= 0.8.1-~~),
> + librust-proxmox-io-1+default-dev (>= 1.1.0-~~),
> + librust-sha2-0.10+default-dev,
> + librust-tar-0.4+default-dev,
> + librust-thiserror-1+default-dev,
> + librust-zstd-0.12+bindgen-dev,
> + librust-zstd-0.12+default-dev
> +Provides:
> + librust-proxmox-oci+default-dev (= ${binary:Version}),
> + librust-proxmox-oci-0-dev (= ${binary:Version}),
> + librust-proxmox-oci-0+default-dev (= ${binary:Version}),
> + librust-proxmox-oci-0.1-dev (= ${binary:Version}),
> + librust-proxmox-oci-0.1+default-dev (= ${binary:Version}),
> + librust-proxmox-oci-0.1.0-dev (= ${binary:Version}),
> + librust-proxmox-oci-0.1.0+default-dev (= ${binary:Version})
> +Description: OCI image parsing and extraction - Rust source code
> + Source code for Debianized Rust crate "proxmox-oci"
> diff --git a/proxmox-oci/debian/debcargo.toml b/proxmox-oci/debian/debcargo.toml
> new file mode 100644
> index 00000000..b7864cdb
> --- /dev/null
> +++ b/proxmox-oci/debian/debcargo.toml
> @@ -0,0 +1,7 @@
> +overlay = "."
> +crate_src_path = ".."
> +maintainer = "Proxmox Support Team <support at proxmox.com>"
> +
> +[source]
> +vcs_git = "git://git.proxmox.com/git/proxmox.git"
> +vcs_browser = "https://git.proxmox.com/?p=proxmox.git"
> diff --git a/proxmox-oci/src/lib.rs b/proxmox-oci/src/lib.rs
> new file mode 100644
> index 00000000..cc5a1d46
> --- /dev/null
> +++ b/proxmox-oci/src/lib.rs
> @@ -0,0 +1,196 @@
> +use flate2::read::GzDecoder;
> +use oci_spec::image::{Arch, Config, ImageConfiguration, ImageManifest, MediaType};
> +use oci_spec::OciSpecError;
> +use oci_tar_image::OciTarImage;
^ Please group this import with its `mod`.
> +use sha2::digest::generic_array::GenericArray;
> +use sha2::{Digest, Sha256};
> +use std::collections::HashMap;
> +use std::fs::File;
> +use std::io::{Read, Seek};
> +use std::path::PathBuf;
> +use std::str::FromStr;
> +use tar::Archive;
> +use thiserror::Error;
^ Please group imports. `std` first, the rest by "distance":
- std
- external
- ours
- `crate::...`
> +
> +pub mod oci_tar_image;
^ Does it make sense to make this public?
(Generally: avoid using `pub` and only add it where needed.)
> +
> +fn compute_digest<R: Read, H: Digest>(
> + mut reader: R,
> + mut hasher: H,
> +) -> GenericArray<u8, H::OutputSize> {
> + let mut buf = proxmox_io::boxed::zeroed(4096);
> +
> + loop {
> + let bytes_read = reader.read(&mut buf).unwrap();
^ This `unwrap()` is reachable through regular file I/O and potentially
compression layers AFAICT... We definitely need to propagate errors
here.
> + if bytes_read == 0 {
> + break hasher.finalize();
> + }
> +
> + hasher.update(&buf[..bytes_read]);
> + }
> +}
> +
> +fn compute_sha256<R: Read>(reader: R) -> oci_spec::image::Sha256Digest {
> + let digest = compute_digest(reader, Sha256::new());
> + oci_spec::image::Sha256Digest::from_str(&format!("{:x}", digest)).unwrap()
> +}
> +
> +/// Build a mapping from uncompressed layer digests (as found in the image config's `rootfs.diff_ids`)
> +/// to their corresponding compressed-layer digests (i.e. the filenames under `blobs/<algorithm>/<digest>`)
> +fn build_layer_map<R: Read + Seek>(
> + oci_tar_image: &mut OciTarImage<R>,
> + image_manifest: &ImageManifest,
> +) -> HashMap<oci_spec::image::Digest, oci_spec::image::Descriptor> {
> + let mut layer_mapping = HashMap::new();
> +
> + for layer in image_manifest.layers() {
> + let digest = match layer.media_type() {
> + MediaType::ImageLayer | MediaType::ImageLayerNonDistributable => {
> + Some(layer.digest().clone())
> + }
> + MediaType::ImageLayerGzip | MediaType::ImageLayerNonDistributableGzip => {
> + let compressed_blob = oci_tar_image.open_blob(layer.digest()).unwrap();
> + let decoder = GzDecoder::new(compressed_blob);
> + Some(compute_sha256(decoder).into())
> + }
> + MediaType::ImageLayerZstd | MediaType::ImageLayerNonDistributableZstd => {
> + let compressed_blob = oci_tar_image.open_blob(layer.digest()).unwrap();
> + let decoder = zstd::Decoder::new(compressed_blob).unwrap();
> + Some(compute_sha256(decoder).into())
> + }
> + _ => None,
This could just `continue` and the match can then drop the `Option` and
we can skip the `if let Some()` below.
Also:
Are other types not important? If so, why? (This should be documented.)
And do we know that this stays the case, or should we list the rest
explicitly, so that `oci-spec` crate updates force use to look at that?
> + };
> +
> + if let Some(digest) = digest {
> + layer_mapping.insert(digest, layer.clone());
> + }
> + }
> +
> + layer_mapping
> +}
> +
> +#[derive(Debug, Error)]
> +pub enum ProxmoxOciError {
> + #[error("Error while parsing OCI image: {0}")]
> + ParseError(#[from] ParseError),
> + #[error("Error while extracting OCI image: {0}")]
> + ExtractError(#[from] ExtractError),
> +}
> +
> +pub fn parse_and_extract_image(
> + oci_tar_path: &str,
> + rootfs_path: &str,
Use `&Path` or `P: AsRef<Path>` for these parameters.
> +) -> Result<Option<Config>, ProxmoxOciError> {
> + let (mut oci_tar_image, image_manifest, image_config) = parse_image(oci_tar_path)?;
> +
> + extract_image_rootfs(
> + &mut oci_tar_image,
> + &image_manifest,
> + &image_config,
> + rootfs_path.into(),
> + )?;
> +
> + Ok(image_config.config().clone())
> +}
> +
> +#[derive(Debug, Error)]
> +pub enum ParseError {
> + #[error("Not an OCI image: {0}")]
> + NotAnOciImage(OciSpecError),
^ I'm not convinced this is the right name, as in I don't think mapping
all `OciSpecError::SerDe` and `...::Builder` errors to "this is not an
OCI image" makes sense.
Just use `OciSpec(OciSpecError)`.
> + #[error("Wrong media type")]
> + WrongMediaType,
> + #[error("IO error: {0}")]
> + IoError(#[from] std::io::Error),
^ I'd drop the `Error` suffix since we're already in the
`Parse*Error*` type anyway ;-)
(The other variants don't have an Error suffix either, after all).
> + #[error("Unsupported CPU architecture")]
> + UnsupportedArchitecture,
> + #[error("Missing image config")]
> + MissingImageConfig,
> +}
> +
> +impl From<OciSpecError> for ParseError {
> + fn from(oci_spec_err: OciSpecError) -> Self {
> + match oci_spec_err {
> + OciSpecError::Io(ioerr) => Self::IoError(ioerr),
> + ocierr => Self::NotAnOciImage(ocierr),
^ If we decide that `NotAnOciImage` is not the correct name, I'd also
drop this `From` impl and just use `#[from]`, since then it would be
nice to have the IO error in the place where it happens I think...
> + }
> + }
> +}
> +
> +fn parse_image(
> + oci_tar_path: &str,
> +) -> Result<(OciTarImage<File>, ImageManifest, ImageConfiguration), ParseError> {
> + let oci_tar_file = File::open(oci_tar_path)?;
> + let mut oci_tar_image = OciTarImage::new(oci_tar_file)?;
> +
> + let image_manifest = oci_tar_image
> + .image_manifest(&Arch::Amd64)
> + .ok_or(ParseError::UnsupportedArchitecture)??;
> +
> + let image_config_descriptor = image_manifest.config();
> +
> + if image_config_descriptor.media_type() != &MediaType::ImageConfig {
> + return Err(ParseError::WrongMediaType);
> + }
> +
> + let image_config_file = oci_tar_image
> + .open_blob(image_config_descriptor.digest())
> + .ok_or(ParseError::MissingImageConfig)?;
> + let image_config = ImageConfiguration::from_reader(image_config_file)?;
> +
> + Ok((oci_tar_image, image_manifest, image_config))
> +}
> +
> +#[derive(Debug, Error)]
> +pub enum ExtractError {
> + #[error("Rootfs destination path not found")]
> + RootfsDestinationNotFound,
^ Is it really that helpful to have this case at all?
If the only entry point into this crate is via the
`parse_and_extract_image()` function, we can just document that the path
should be pre-created and not check this.
> + #[error("Unknown layer digest found in rootfs.diff_ids")]
> + UnknownLayerDigest,
> + #[error("Layer file mentioned in image manifest is missing")]
> + MissingLayerFile,
^ Could consider including the digest in the above 2 cases.
> + #[error("IO error: {0}")]
> + IoError(#[from] std::io::Error),
^ I'd drop the `Error` suffix since we're already in the
`Extract*Error*` type anyway ;-)
> + #[error("Layer has wrong media type")]
> + WrongMediaType,
^ Could consider including the textual version of the type.
> +}
> +
> +fn extract_image_rootfs<R: Read + Seek>(
> + oci_tar_image: &mut OciTarImage<R>,
> + image_manifest: &ImageManifest,
> + image_config: &ImageConfiguration,
> + target_path: PathBuf,
Make this &Path and drop the & in the unpack() call.
> +) -> Result<(), ExtractError> {
> + if !target_path.exists() {
> + return Err(ExtractError::RootfsDestinationNotFound);
> + }
^ As mentioned - not sure this is too helpful.
> +
> + let layer_map = build_layer_map(oci_tar_image, image_manifest);
> +
> + for layer in image_config.rootfs().diff_ids() {
> + let layer_digest = oci_spec::image::Digest::from_str(layer)
> + .map_err(|_| ExtractError::UnknownLayerDigest)?;
> + let layer_descriptor = layer_map
> + .get(&layer_digest)
> + .ok_or(ExtractError::UnknownLayerDigest)?;
> + let layer_file = oci_tar_image
> + .open_blob(layer_descriptor.digest())
> + .ok_or(ExtractError::MissingLayerFile)?;
> +
> + let tar_file: Box<dyn Read> = match layer_descriptor.media_type() {
> + MediaType::ImageLayer | MediaType::ImageLayerNonDistributable => Box::new(layer_file),
> + MediaType::ImageLayerGzip | MediaType::ImageLayerNonDistributableGzip => {
> + Box::new(GzDecoder::new(layer_file))
> + }
> + MediaType::ImageLayerZstd | MediaType::ImageLayerNonDistributableZstd => {
> + Box::new(zstd::Decoder::new(layer_file)?)
> + }
> + _ => return Err(ExtractError::WrongMediaType),
> + };
> +
> + let mut archive = Archive::new(tar_file);
> + archive.set_preserve_ownerships(true);
What about `.set_preserve_permissions()`, `.set_preserve_xattrs()`,
> + archive.unpack(&target_path)?;
> + }
> +
> + Ok(())
> +}
> diff --git a/proxmox-oci/src/oci_tar_image.rs b/proxmox-oci/src/oci_tar_image.rs
> new file mode 100644
> index 00000000..555558f5
> --- /dev/null
> +++ b/proxmox-oci/src/oci_tar_image.rs
> @@ -0,0 +1,167 @@
> +use oci_spec::image::{Arch, Digest, ImageIndex, ImageManifest, MediaType};
> +use oci_spec::OciSpecError;
> +use std::cmp::min;
> +use std::collections::HashMap;
> +use std::io::{Read, Seek, SeekFrom};
> +use std::path::PathBuf;
> +use tar::Archive;
^ Group these as well.
> +
> +#[derive(Clone)]
^ Could also be Copy
> +pub struct TarEntry {
> + offset: u64,
> + size: usize,
> +}
> +
> +pub struct TarEntryReader<'a, R: Read + Seek> {
^ This is quite generic. We could put this in `proxmox-io`:
But then I'd argue to drop the lifetime and take `R` directly, since
both `Read` and `Seek` are implemented for mutable references:
struct RangeReader<R: Read + Seek> {
reader: R,
/// Range inside `R`.
range: Range<u64>,
/// Relative position inside `range`.
position: u64,
/// True once we performed the initial seek.
ready: bool,
}
impl<R: Read + Seek> RangeReader<R> {
pub fn new(reader: R, range: Range<u64>) -> Self;
pub fn into_inner(self) -> R;
pub fn size(&self) -> usize;
pub fn remaining(&self) -> usize;
}
> + tar_entry: TarEntry,
> + reader: &'a mut R,
> + position: u64,
> +}
> +
> +impl<'a, R: Read + Seek> TarEntryReader<'a, R> {
> + pub fn new(tar_entry: TarEntry, reader: &'a mut R) -> Self {
^ As an *internal* helper (if not moved to proxmox-io), this `pub`
should be dropped.
> + Self {
> + tar_entry,
> + reader,
> + position: 0,
> + }
> + }
> +}
> +
> +impl<R: Read + Seek> Read for TarEntryReader<'_, R> {
> + fn read(&mut self, buf: &mut [u8]) -> std::io::Result<usize> {
> + let max_read = min(buf.len(), self.tar_entry.size - self.position as usize);
^ For maintainability/readability, I'd prefer the `size-position` to be
factored out into an `fn remaining(&self) -> usize`.
> + let limited_buf = &mut buf[..max_read];
> + self.reader
> + .seek(SeekFrom::Start(self.tar_entry.offset + self.position))?;
^ We can reduce syscalls by doing this only once:
As long as this instance of `TarEntryReader` (or `RangeReader`) exist we
have exclusive access to `R`, since we own it (or it is a mutable
reference).
> + let bytes_read = self.reader.read(limited_buf)?;
> + self.position += bytes_read as u64;
^ If made generic in `proxmox-io`, this should use
`bytes_read.min(self.size())`, just in case `R` has a broken `read()`
implementation and returns a value larger than the buffer's size...
> +
> + Ok(bytes_read)
> + }
> +}
> +
> +impl<R: Read + Seek> Seek for TarEntryReader<'_, R> {
> + fn seek(&mut self, pos: SeekFrom) -> std::io::Result<u64> {
> + self.position = match pos {
> + SeekFrom::Start(position) => min(position, self.tar_entry.size as u64),
> + SeekFrom::End(offset) => {
> + if offset > self.tar_entry.size as i64 {
> + return Err(std::io::Error::new(
> + std::io::ErrorKind::NotSeekable,
^ NotSeekable -> InvalidInput, see `lseek(2)`/`fseek(3)`.
> + "Tried to seek before the beginning of the file",
> + ));
> + }
> +
> + (if offset <= 0 {
> + self.tar_entry.size
> + } else {
> + self.tar_entry.size - offset as usize
> + }) as u64
> + }
> + SeekFrom::Current(offset) => {
> + if (self.position as i64 + offset) < 0 {
^ Wraparound checks are better done with
`self.position.checked_add(offset)`, which will give you an `Err` if it
wraps.
But to be honest... I think we may as well disregard this and just use
`position.saturating_add(offset).min(size)`...
For "real" files, seeking past the end is not an error after all, and
it would simply put us at EOF.
> + return Err(std::io::Error::new(
> + std::io::ErrorKind::NotSeekable,
> + "Tried to seek before the beginning of the file",
> + ));
> + }
> +
> + min(self.position + offset as u64, self.tar_entry.size as u64)
> + }
> + };
> +
> + Ok(self.position)
> + }
> +}
> +
> +struct TarArchive<R: Read + Seek> {
> + reader: R,
> + entries: HashMap<PathBuf, TarEntry>,
> +}
> +
> +impl<R: Read + Seek> TarArchive<R> {
^ The struct is private, so all fns here can drop the `pub`.
> + pub fn new(reader: R) -> std::io::Result<Self> {
> + let mut archive = Archive::new(reader);
> + let entries = archive.entries_with_seek()?;
> + let mut entries_index = HashMap::new();
> +
> + for entry in entries {
> + let entry = entry?;
> + let offset = entry.raw_file_position();
> + let size = entry.size() as usize;
> + let path = entry.path()?.into_owned();
> + let tar_entry = TarEntry { offset, size };
> + entries_index.insert(path, tar_entry);
> + }
> +
> + Ok(Self {
> + reader: archive.into_inner(),
> + entries: entries_index,
> + })
> + }
> +
> + pub fn get_inner_file(&mut self, inner_file_path: PathBuf) -> Option<TarEntryReader<R>> {
^ We don't need ownership here, just use `&Path`.
Or, if you don't want to have to call `.as_ref()` at the call site, use
`P where P: AsRef<Path>`.
> + match self.entries.get(&inner_file_path) {
> + Some(tar_entry) => Some(TarEntryReader::new(tar_entry.clone(), &mut self.reader)),
> + None => None,
> + }
> + }
> +}
> +
> +pub struct OciTarImage<R: Read + Seek> {
> + archive: TarArchive<R>,
> + image_index: ImageIndex,
> +}
> +
> +impl<R: Read + Seek> OciTarImage<R> {
> + pub fn new(reader: R) -> oci_spec::Result<Self> {
> + let mut archive = TarArchive::new(reader)?;
> + let index_file = archive
> + .get_inner_file("index.json".into())
> + .ok_or(OciSpecError::Other("Missing index.json file".into()))?;
> + let image_index = ImageIndex::from_reader(index_file)?;
> +
> + Ok(Self {
> + archive,
> + image_index,
> + })
> + }
> +
> + pub fn image_index(&self) -> &ImageIndex {
> + &self.image_index
> + }
> +
> + pub fn open_blob(&mut self, digest: &Digest) -> Option<TarEntryReader<R>> {
> + self.archive.get_inner_file(get_blob_path(digest))
> + }
> +
> + pub fn image_manifest(
> + &mut self,
> + architecture: &Arch,
> + ) -> Option<oci_spec::Result<ImageManifest>> {
> + match self.image_index.manifests().iter().find(|&x| {
> + x.media_type() == &MediaType::ImageManifest
> + && x.platform()
> + .as_ref()
> + .is_none_or(|platform| platform.architecture() == architecture)
> + }) {
> + Some(descriptor) => match self.open_blob(&descriptor.digest().clone()) {
> + Some(image_manifest_file) => Some(ImageManifest::from_reader(image_manifest_file)),
> + None => Some(Err(OciSpecError::Other(
> + "Image manifest mentioned in image index is missing".into(),
Would be good to include the digest in the error.
> + ))),
> + },
> + None => None,
> + }
> + }
> +}
> +
> +fn get_blob_path(digest: &Digest) -> PathBuf {
Given that all elements here are regular strings, this could just do:
format!("blobs/{algorithm}/{digest}", algorithm = digest.algorithm()).into()
And when using `<P: AsRef<Path>>` above, you could return `String` and
drop the `.into()` at the end...
> + let algorithm = digest.algorithm().as_ref();
> + let digest = digest.digest();
> + let mut path = PathBuf::from("blobs");
> + path.push(algorithm);
> + path.push(digest);
> + path
> +}
> --
> 2.39.5
More information about the pve-devel
mailing list