Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Expose hash over ChunkListEntry to borg list --format {chunk_ids_$HASH} #5167

Draft
wants to merge 6 commits into
base: master
Choose a base branch
from
27 changes: 24 additions & 3 deletions src/borg/helpers/parseformat.py
Expand Up @@ -688,6 +688,7 @@ class ItemFormatter(BaseFormatter):
('size', 'csize', 'dsize', 'dcsize', 'num_chunks', 'unique_chunks'),
('mtime', 'ctime', 'atime', 'isomtime', 'isoctime', 'isoatime'),
tuple(sorted(hash_algorithms)),
tuple(['chunker_params', 'chunk_ids_checksum']),
('archiveid', 'archivename', 'extra'),
('health', )
)
Expand Down Expand Up @@ -754,6 +755,8 @@ def __init__(self, archive, format, *, json_lines=False):
'csize': self.calculate_csize,
'dsize': partial(self.sum_unique_chunks_metadata, lambda chunk: chunk.size),
'dcsize': partial(self.sum_unique_chunks_metadata, lambda chunk: chunk.csize),
'chunker_params': self.hash_chunker_params,
'chunk_ids_checksum': self.hash_chunks,
'num_chunks': self.calculate_num_chunks,
'unique_chunks': partial(self.sum_unique_chunks_metadata, lambda chunk: 1),
'isomtime': partial(self.format_iso_time, 'mtime'),
Expand Down Expand Up @@ -834,13 +837,31 @@ def calculate_csize(self, item):
# note: does not support hardlink slaves, they will be csize 0
return item.get_size(compressed=True)

def hash_item(self, hash_function, item):
if 'chunks' not in item:
return ""
def prepare_hash_function(self, hash_function):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This does not "prepare" the hash function in any way; it just picks one by its name. What one passes into it is that name, not a function, so naming this something like get_hash_function(self, name) would be less misleading and more self-explanatory.

if hash_function in hashlib.algorithms_guaranteed:
hash = hashlib.new(hash_function)
elif hash_function == 'xxh64':
hash = self.xxh64()
return hash

def hash_chunker_params(self, item):
    """Return the archive's chunker parameters as a '-'-joined string.

    Despite the name, nothing is hashed here: each element of the archive's
    ``chunker_params`` metadata entry is rendered with ``repr()`` and the
    pieces are joined with ``-``.  The value is read from the archive
    metadata only, so *item* is unused.

    Returns an empty string when the metadata lookup yields ``None``.
    """
    chunker_params = self.archive.metadata.get('chunker_params')
    if chunker_params is None:
        # Assuming metadata.get() follows the dict convention of returning
        # None for a missing key: map() over None would raise TypeError.
        return ""
    return '-'.join(map(repr, chunker_params))

def hash_chunks(self, item):
    """Return the SHA-256 hex digest computed over the item's chunk ids.

    The ``id`` of every entry in ``item.chunks`` is fed to the hash in list
    order, so the digest covers the concatenation of all chunk ids.  Only the
    ids are used; no chunk data is fetched.

    Returns an empty string for items that have no ``chunks`` entry.
    """
    if 'chunks' not in item:
        return ""
    # 'sha256' is always part of hashlib.algorithms_guaranteed, so no runtime
    # check (and in particular no assert, which -O would strip) is needed.
    digest = hashlib.new('sha256')
    for chunk in item.chunks:
        digest.update(chunk.id)
    return digest.hexdigest()

def hash_item(self, hash_function, item):
    """Return the hex digest of the item's content for the named hash.

    *hash_function* is handed to ``self.prepare_hash_function`` to obtain a
    hash object.  Everything yielded by the archive pipeline's ``fetch_many``
    for the item's chunk ids is then fed into it in the order received
    (presumably the chunk contents in id order -- confirm against the
    pipeline implementation).

    Returns an empty string for items that have no ``chunks`` entry.
    """
    has_chunks = 'chunks' in item
    if not has_chunks:
        return ""
    hasher = self.prepare_hash_function(hash_function)
    chunk_ids = [entry.id for entry in item.chunks]
    for block in self.archive.pipeline.fetch_many(chunk_ids):
        hasher.update(block)
    return hasher.hexdigest()
Expand Down