Skip to content

Commit

Permalink
extract: --skip-errors ignores corrupted chunks (w/ log message), see #840
Browse files Browse the repository at this point in the history

Forward port of a change implemented by @enkore back in 2016:

enkore@09b21b1
  • Loading branch information
ThomasWaldmann committed Mar 28, 2023
1 parent 80c08ab commit 33f823d
Show file tree
Hide file tree
Showing 4 changed files with 72 additions and 14 deletions.
1 change: 1 addition & 0 deletions setup.cfg
Expand Up @@ -126,6 +126,7 @@ per_file_ignores =
src/borg/archiver/debug_cmd.py:F405
src/borg/archiver/delete_cmd.py:F405
src/borg/archiver/diff_cmd.py:F405
src/borg/archiver/extract_cmd.py:F405
src/borg/archiver/help_cmd.py:E501,F405
src/borg/archiver/key_cmds.py:F405
src/borg/archiver/prune_cmd.py:F405
Expand Down
51 changes: 39 additions & 12 deletions src/borg/archive.py
Expand Up @@ -791,6 +791,7 @@ def extract_item(
stripped_components=0,
original_path=None,
pi=None,
skip_integrity_errors=False,
):
"""
Extract archive item.
Expand All @@ -804,6 +805,8 @@ def extract_item(
:param stripped_components: stripped leading path components to correct hard link extraction
:param original_path: 'path' key as stored in archive
:param pi: ProgressIndicatorPercent (or similar) for file extraction progress (in bytes)
:param skip_integrity_errors: skip over corrupted chunks instead of raising IntegrityError
(ignored for dry_run and stdout)
"""
has_damaged_chunks = "chunks_healthy" in item
if dry_run or stdout:
Expand Down Expand Up @@ -832,7 +835,7 @@ def extract_item(
)
if has_damaged_chunks:
raise BackupError("File has damaged (all-zero) chunks. Try running borg check --repair.")
return
return True

original_path = original_path or item.path
dest = self.cwd
Expand Down Expand Up @@ -867,15 +870,38 @@ def make_parent(path):
fd = open(path, "wb")
with fd:
ids = [c.id for c in item.chunks]
for data in self.pipeline.fetch_many(ids, is_preloaded=True):
chunk_index = -1
chunk_iterator = self.pipeline.fetch_many(ids, is_preloaded=True)
skipped_errors = False
while True:
try:
chunk_index += 1
data = next(chunk_iterator)
except StopIteration:
break
except IntegrityError as err:
if not skip_integrity_errors:
raise
c = item.chunks[chunk_index]
size = c.size
logger.warning("%s: chunk %s: %s", remove_surrogates(item.path), bin_to_hex(c.id), err)
with backup_io("seek"):
fd.seek(size, 1)
skipped_errors = True
# restart chunk data generator
ids = [c.id for c in item.chunks[chunk_index + 1 :]]
chunk_iterator = self.pipeline.fetch_many(ids, is_preloaded=True)
else:
with backup_io("write"):
size = len(data)
if sparse and zeros.startswith(data):
# all-zero chunk: create a hole in a sparse file
fd.seek(size, 1)
else:
fd.write(data)
if pi:
pi.show(increase=len(data), info=[remove_surrogates(item.path)])
with backup_io("write"):
if sparse and zeros.startswith(data):
# all-zero chunk: create a hole in a sparse file
fd.seek(len(data), 1)
else:
fd.write(data)
pi.show(increase=size, info=[remove_surrogates(item.path)])

with backup_io("truncate_and_attrs"):
pos = item_chunks_size = fd.tell()
fd.truncate(pos)
Expand All @@ -889,7 +915,7 @@ def make_parent(path):
)
if has_damaged_chunks:
raise BackupError("File has damaged (all-zero) chunks. Try running borg check --repair.")
return
return not skipped_errors
with backup_io:
# No repository access beyond this point.
if stat.S_ISDIR(mode):
Expand All @@ -914,18 +940,19 @@ def make_parent(path):
make_parent(path)
with self.extract_helper(item, path, hlm) as hardlink_set:
if hardlink_set:
return
return True
os.mkfifo(path)
self.restore_attrs(path, item)
elif stat.S_ISCHR(mode) or stat.S_ISBLK(mode):
make_parent(path)
with self.extract_helper(item, path, hlm) as hardlink_set:
if hardlink_set:
return
return True
os.mknod(path, item.mode, item.rdev)
self.restore_attrs(path, item)
else:
raise Exception("Unknown archive item type %r" % item.mode)
return True

def restore_attrs(self, path, item, symlink=False, fd=None):
"""
Expand Down
14 changes: 12 additions & 2 deletions src/borg/archiver/extract_cmd.py
Expand Up @@ -39,6 +39,7 @@ def do_extract(self, args, repository, manifest, archive):
progress = args.progress
output_list = args.output_list
dry_run = args.dry_run
skip_errors = args.skip_errors
stdout = args.stdout
sparse = args.sparse
strip_components = args.strip_components
Expand Down Expand Up @@ -75,15 +76,17 @@ def do_extract(self, args, repository, manifest, archive):
dirs.append(item)
archive.extract_item(item, stdout=stdout, restore_attrs=False)
else:
archive.extract_item(
if not archive.extract_item(
item,
stdout=stdout,
sparse=sparse,
hlm=hlm,
stripped_components=strip_components,
original_path=orig_path,
pi=pi,
)
skip_integrity_errors=skip_errors,
):
self.exit_code = EXIT_WARNING
except (BackupOSError, BackupError) as e:
self.print_warning("%s: %s", remove_surrogates(orig_path), e)

Expand Down Expand Up @@ -174,6 +177,13 @@ def build_parser_extract(self, subparsers, common_parser, mid_common_parser):
action="store_true",
help="create holes in output sparse file from all-zero chunks",
)
subparser.add_argument(
"--skip-errors",
dest="skip_errors",
action="store_true",
help="skip corrupted chunks with a log message (exit 1) instead of aborting "
"(no effect for --dry-run and --stdout)",
)
subparser.add_argument("name", metavar="NAME", type=archivename_validator, help="specify the archive name")
subparser.add_argument(
"paths", metavar="PATH", nargs="*", type=str, help="paths to extract; patterns are supported"
Expand Down
20 changes: 20 additions & 0 deletions src/borg/testsuite/archiver/extract_cmd.py
Expand Up @@ -585,6 +585,26 @@ def test_overwrite(self):
with changedir("output"):
self.cmd(f"--repo={self.repository_location}", "extract", "test", exit_code=1)

def test_extract_skip_errors(self):
    """extract --skip-errors: corrupted chunks are logged and skipped; exit code is 1 (warning)."""
    # 560 bytes of content; the tiny chunker params used on "create" below force
    # the file to be split into multiple chunks so a single chunk can be corrupted.
    self.create_regular_file("file1", contents=b"a" * 280 + b"b" * 280)
    # NOTE: "-e" "none" are adjacent string literals, concatenated to "-enone"
    # (argparse short option with attached value), i.e. an unencrypted repository.
    self.cmd(f"--repo={self.repository_location}", "rcreate", "-e" "none")
    self.cmd(f"--repo={self.repository_location}", "create", "--chunker-params", "7,9,8,128", "test", "input")
    segment_files = sorted(os.listdir(os.path.join(self.repository_path, "data", "0")), reverse=True)
    # Debug aid: print each segment file's size so a wrong pick below is easy to diagnose.
    print(
        ", ".join(
            f"{fn}: {os.stat(os.path.join(self.repository_path, 'data', '0', fn)).st_size}b" for fn in segment_files
        )
    )
    # assumes the 4th-newest segment file holds file1's data chunks — fragile,
    # depends on repository segment layout; TODO confirm if layout changes
    name = segment_files[3]  # must be the segment file that has the file's chunks
    with open(os.path.join(self.repository_path, "data", "0", name), "r+b") as fd:
        # Overwrite 4 bytes in place so at least one chunk fails its integrity check.
        fd.seek(100)
        fd.write(b"XXXX")
    with changedir("output"):
        # --skip-errors must log the bad chunk and continue, exiting with warning code 1.
        output = self.cmd(f"--repo={self.repository_location}", "extract", "--skip-errors", "test", exit_code=1)
        assert "input/file1: chunk" in output
        # The skipped chunk is seeked over, so the extracted file keeps its full size.
        assert os.stat("input/file1").st_size == 560
    # The repository really is damaged, so "check" must report an error as well.
    self.cmd(f"--repo={self.repository_location}", "check", exit_code=1)

# derived from test_extract_xattrs_errors()
@pytest.mark.skipif(
not xattr.XATTR_FAKEROOT, reason="xattr not supported on this system or on this version of fakeroot"
Expand Down

0 comments on commit 33f823d

Please sign in to comment.