Expose hash over ChunkListEntry to borg list --format {chunk_ids_$HASH} #5167

Draft
wants to merge 6 commits into base: master
32 changes: 29 additions & 3 deletions src/borg/helpers/parseformat.py
@@ -688,6 +688,8 @@ class ItemFormatter(BaseFormatter):
        ('size', 'csize', 'dsize', 'dcsize', 'num_chunks', 'unique_chunks'),
        ('mtime', 'ctime', 'atime', 'isomtime', 'isoctime', 'isoatime'),
        tuple(sorted(hash_algorithms)),
        tuple(['chunk_ids_%s' % alg for alg in sorted(hash_algorithms)] + [
            'chunker_params_%s' % alg for alg in sorted(hash_algorithms)]),
Member
The chunker params are the same within one archive, so I guess you do not want to show them for each file?

Contributor Author

Hmm, true, but this would make parsing the output of list harder. As it is now, you can just compare (item['chunk_ids_checksum'], item['chunker_params']) after parsing the output for each item.
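For context, a minimal sketch of that comparison, assuming the chunk_ids_sha256 / chunker_params_sha256 keys added by this PR and that --format keys show up as fields in --json-lines output (archive names are illustrative):

import json
import subprocess

def list_items(archive):
    # Gather (chunk id digest, chunker params digest) per path from borg list.
    fmt = '{path}{chunk_ids_sha256}{chunker_params_sha256}'
    out = subprocess.run(['borg', 'list', '--json-lines', '--format', fmt, archive],
                         check=True, capture_output=True, text=True).stdout
    return {item['path']: (item['chunk_ids_sha256'], item['chunker_params_sha256'])
            for item in map(json.loads, out.splitlines())}

old, new = list_items('repo::archive-1'), list_items('repo::archive-2')
# Paths present in both archives whose chunk lists (or chunker params) differ.
changed = sorted(path for path in old.keys() & new.keys() if old[path] != new[path])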

        ('archiveid', 'archivename', 'extra'),
        ('health', )
    )
@@ -765,6 +767,8 @@ def __init__(self, archive, format, *, json_lines=False):
        }
        for hash_function in self.hash_algorithms:
            self.add_key(hash_function, partial(self.hash_item, hash_function))
            self.call_keys['chunk_ids_%s' % hash_function] = partial(self.hash_chunks, hash_function)
            self.call_keys['chunker_params_%s' % hash_function] = partial(self.hash_chunker_params, hash_function)
        self.used_call_keys = set(self.call_keys) & self.format_keys

    def format_item_json(self, item):
@@ -834,13 +838,35 @@ def calculate_csize(self, item):
        # note: does not support hardlink slaves, they will be csize 0
        return item.get_size(compressed=True)

    def hash_item(self, hash_function, item):
        if 'chunks' not in item:
            return ""
    def prepare_hash_function(self, hash_function):
Contributor
This does not "prepare" the hash function in any way; it just picks one by its name, which is what you pass in rather than a function. Naming it something like get_hash_function(self, name) would be less misleading and more self-explanatory.

        if hash_function in hashlib.algorithms_guaranteed:
            hash = hashlib.new(hash_function)
        elif hash_function == 'xxh64':
            hash = self.xxh64()
        return hash
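Following up on the naming comment above, a hedged sketch of the suggested rename (get_hash_function is the reviewer's proposed name, not code from this PR; the explicit error for unknown names is an added assumption, since the current code would otherwise return an unbound variable):

    def get_hash_function(self, name):
        # Return a fresh hash object selected by algorithm name.
        if name in hashlib.algorithms_guaranteed:
            return hashlib.new(name)
        if name == 'xxh64':
            return self.xxh64()
        raise ValueError('unsupported hash algorithm: %s' % name)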

    def hash_chunker_params(self, hash_function, item):
        hash = self.prepare_hash_function(hash_function)

        chunker_params = self.archive.metadata.get('chunker_params')
        for info in chunker_params:
            hash.update(bytes(info))
        return hash.hexdigest()

    def hash_chunks(self, hash_function, item):
        if 'chunks' not in item:
            return ""
        hash = self.prepare_hash_function(hash_function)
        for chunk in item.chunks:
            hash.update(chunk.id)
            hash.update(bytes(chunk.size))
            hash.update(bytes(chunk.csize))
        return hash.hexdigest()

    def hash_item(self, hash_function, item):
        if 'chunks' not in item:
            return ""
        hash = self.prepare_hash_function(hash_function)
        for data in self.archive.pipeline.fetch_many([c.id for c in item.chunks]):
            hash.update(data)
        return hash.hexdigest()