You signed in with another tab or window. Reload to refresh your session. You signed out in another tab or window. Reload to refresh your session. You switched accounts on another tab or window. Reload to refresh your session. Dismiss alert
2024-02-14 21:01 INFO 2048692:root - Downloaded https://dl.fbaipublicfiles.com/laser/CCMatrix/v1.0.0/2020-10_0278.tsv.gz [200] took 8s (5766.4kB/s)
2024-02-14 21:01 INFO 2048692:root - Starting download of https://commoncrawl.s3.amazonaws.com/crawl-data/CC-MAIN-2020-10/segments/1581875145708.59/wet/CC-MAIN-20200222150029-20200222180029-00542.warc.wet.gz
2024-02-14 21:01 INFO 2048693:root - Downloaded https://dl.fbaipublicfiles.com/laser/CCMatrix/v1.0.0/2018-05_0044.tsv.gz [200] took 9s (5267.6kB/s)
2024-02-14 21:01 INFO 2048693:root - Starting download of https://commoncrawl.s3.amazonaws.com/crawl-data/CC-MAIN-2018-05/segments/1516084886639.11/wet/CC-MAIN-20180116184540-20180116204540-00601.warc.wet.gz
multiprocessing.pool.RemoteTraceback:
"""
Traceback (most recent call last):
File "/usr/lib64/python3.10/multiprocessing/pool.py", line 125, in worker
result = (True, func(*args, **kwds))
File "/usr/lib64/python3.10/multiprocessing/pool.py", line 48, in mapstar
return list(map(*args))
File "/data/LLM_training/translation/cc_net/dl_cc_matrix.py", line 138, in dl_file
raw_documents = get_documents(segment)
File "/data/LLM_training/translation/cc_net/dl_cc_matrix.py", line 107, in get_documents
return {d["digest"]: d["raw_content"] for d in CCSegmentsReader([segment])}
File "/data/LLM_training/translation/cc_net/dl_cc_matrix.py", line 107, in <dictcomp>
return {d["digest"]: d["raw_content"] for d in CCSegmentsReader([segment])}
File "/data/LLM_training/translation/cc_net/cc_net/process_wet_file.py", line 199, in __iter__
for doc in parse_warc_file(self.open_segment(segment), self.min_len):
File "/data/LLM_training/translation/cc_net/cc_net/process_wet_file.py", line 192, in open_segment
return jsonql.open_remote_file(url, cache=file)
File "/data/LLM_training/translation/cc_net/cc_net/jsonql.py", line 1124, in open_remote_file
raw_bytes = request_get_content(url)
File "/data/LLM_training/translation/cc_net/cc_net/jsonql.py", line 1101, in request_get_content
raise e
File "/data/LLM_training/translation/cc_net/cc_net/jsonql.py", line 1095, in request_get_content
r.raise_for_status()
File "/home/user/.local/lib/python3.10/site-packages/requests/models.py", line 1021, in raise_for_status
raise HTTPError(http_error_msg, response=self)
requests.exceptions.HTTPError: 403 Client Error: Forbidden for url: https://commoncrawl.s3.amazonaws.com/crawl-data/CC-MAIN-2019-09/segments/1550247479101.30/wet/CC-MAIN-20190215183319-20190215205319-00001.warc.wet.gz
"""
The above exception was the direct cause of the following exception:
Traceback (most recent call last):
File "/data/LLM_training/translation/cc_net/dl_cc_matrix.py", line 338, in <module>
func_argparse.main(dl, finalize)
File "/home/user/.local/lib/python3.10/site-packages/func_argparse/__init__.py", line 29, in main
return make_main(*fns, module=module, description=description)(sys.argv[1:])
File "/home/user/.local/lib/python3.10/site-packages/func_argparse/__init__.py", line 72, in parse_and_call
return command(**parsed_args)
File "/data/LLM_training/translation/cc_net/dl_cc_matrix.py", line 103, in dl
pool.map(dlf, file_list)
File "/usr/lib64/python3.10/multiprocessing/pool.py", line 367, in map
return self._map_async(func, iterable, mapstar, chunksize).get()
File "/usr/lib64/python3.10/multiprocessing/pool.py", line 774, in get
raise self._value
requests.exceptions.HTTPError: 403 Client Error: Forbidden for url: https://commoncrawl.s3.amazonaws.com/crawl-data/CC-MAIN-2019-09/segments/1550247479101.30/wet/CC-MAIN-20190215183319-20190215205319-00001.warc.wet.gz
The text was updated successfully, but these errors were encountered:
Hi! If some CommonCrawl files do not exist anymore, I am not sure it would be easy to find them.
Have you considered downloading CCMatrix from another storage, such as https://opus.nlpl.eu/CCMatrix/corpus/version/CCMatrix?
2024-02-14 21:01 INFO 2048692:root - Downloaded https://dl.fbaipublicfiles.com/laser/CCMatrix/v1.0.0/2020-10_0278.tsv.gz [200] took 8s (5766.4kB/s)
2024-02-14 21:01 INFO 2048692:root - Starting download of https://commoncrawl.s3.amazonaws.com/crawl-data/CC-MAIN-2020-10/segments/1581875145708.59/wet/CC-MAIN-20200222150029-20200222180029-00542.warc.wet.gz
2024-02-14 21:01 INFO 2048693:root - Downloaded https://dl.fbaipublicfiles.com/laser/CCMatrix/v1.0.0/2018-05_0044.tsv.gz [200] took 9s (5267.6kB/s)
2024-02-14 21:01 INFO 2048693:root - Starting download of https://commoncrawl.s3.amazonaws.com/crawl-data/CC-MAIN-2018-05/segments/1516084886639.11/wet/CC-MAIN-20180116184540-20180116204540-00601.warc.wet.gz
multiprocessing.pool.RemoteTraceback:
"""
Traceback (most recent call last):
File "/usr/lib64/python3.10/multiprocessing/pool.py", line 125, in worker
result = (True, func(*args, **kwds))
File "/usr/lib64/python3.10/multiprocessing/pool.py", line 48, in mapstar
return list(map(*args))
File "/data/LLM_training/translation/cc_net/dl_cc_matrix.py", line 138, in dl_file
raw_documents = get_documents(segment)
File "/data/LLM_training/translation/cc_net/dl_cc_matrix.py", line 107, in get_documents
return {d["digest"]: d["raw_content"] for d in CCSegmentsReader([segment])}
File "/data/LLM_training/translation/cc_net/dl_cc_matrix.py", line 107, in <dictcomp>
return {d["digest"]: d["raw_content"] for d in CCSegmentsReader([segment])}
File "/data/LLM_training/translation/cc_net/cc_net/process_wet_file.py", line 199, in __iter__
for doc in parse_warc_file(self.open_segment(segment), self.min_len):
File "/data/LLM_training/translation/cc_net/cc_net/process_wet_file.py", line 192, in open_segment
return jsonql.open_remote_file(url, cache=file)
File "/data/LLM_training/translation/cc_net/cc_net/jsonql.py", line 1124, in open_remote_file
raw_bytes = request_get_content(url)
File "/data/LLM_training/translation/cc_net/cc_net/jsonql.py", line 1101, in request_get_content
raise e
File "/data/LLM_training/translation/cc_net/cc_net/jsonql.py", line 1095, in request_get_content
r.raise_for_status()
File "/home/user/.local/lib/python3.10/site-packages/requests/models.py", line 1021, in raise_for_status
raise HTTPError(http_error_msg, response=self)
requests.exceptions.HTTPError: 403 Client Error: Forbidden for url: https://commoncrawl.s3.amazonaws.com/crawl-data/CC-MAIN-2019-09/segments/1550247479101.30/wet/CC-MAIN-20190215183319-20190215205319-00001.warc.wet.gz
"""
The above exception was the direct cause of the following exception:
Traceback (most recent call last):
File "/data/LLM_training/translation/cc_net/dl_cc_matrix.py", line 338, in <module>
func_argparse.main(dl, finalize)
File "/home/user/.local/lib/python3.10/site-packages/func_argparse/__init__.py", line 29, in main
return make_main(*fns, module=module, description=description)(sys.argv[1:])
File "/home/user/.local/lib/python3.10/site-packages/func_argparse/__init__.py", line 72, in parse_and_call
return command(**parsed_args)
File "/data/LLM_training/translation/cc_net/dl_cc_matrix.py", line 103, in dl
pool.map(dlf, file_list)
File "/usr/lib64/python3.10/multiprocessing/pool.py", line 367, in map
return self._map_async(func, iterable, mapstar, chunksize).get()
File "/usr/lib64/python3.10/multiprocessing/pool.py", line 774, in get
raise self._value
requests.exceptions.HTTPError: 403 Client Error: Forbidden for url: https://commoncrawl.s3.amazonaws.com/crawl-data/CC-MAIN-2019-09/segments/1550247479101.30/wet/CC-MAIN-20190215183319-20190215205319-00001.warc.wet.gz
The text was updated successfully, but these errors were encountered: