{"payload":{"feedbackUrl":"https://github.com/orgs/community/discussions/53140","repo":{"id":295996968,"defaultBranch":"master","name":"warc2text","ownerLogin":"bitextor","currentUserCanPush":false,"isFork":false,"isEmpty":false,"createdAt":"2020-09-16T10:14:55.000Z","ownerAvatar":"https://avatars.githubusercontent.com/u/35965770?v=4","public":true,"private":false,"isOrgOwned":true},"refInfo":{"name":"","listCacheKey":"v0:1713532178.0","currentOid":""},"activityList":{"items":[{"before":"e26bda2d3f0db5bcf3330b4fc4dabba7e128bf22","after":"c0cc3fa330e84409d92123f7e162e831847005ca","ref":"refs/heads/master","pushedAt":"2024-05-08T14:07:29.000Z","pushType":"push","commitsCount":1,"pusher":{"login":"ZJaume","name":"Jaume Zaragoza","path":"/ZJaume","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/11339330?s=80&v=4"},"commit":{"message":"Update installation instructions with needed dependencies\n\nList specific boost libraries used instead of libboost-all-dev","shortMessageHtmlLink":"Update installation instructions with needed dependencies"}},{"before":"d066592685c17f5efa2624029e6206f5a74db63f","after":"e26bda2d3f0db5bcf3330b4fc4dabba7e128bf22","ref":"refs/heads/master","pushedAt":"2024-05-03T13:25:03.000Z","pushType":"push","commitsCount":1,"pusher":{"login":"ZJaume","name":"Jaume Zaragoza","path":"/ZJaume","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/11339330?s=80&v=4"},"commit":{"message":"Write json objects, not arrays","shortMessageHtmlLink":"Write json objects, not arrays"}},{"before":"77c514f2df9da14a0143876d6ba2b140ec2da60a","after":null,"ref":"refs/heads/nlohmann_json","pushedAt":"2024-04-19T13:09:38.000Z","pushType":"branch_deletion","commitsCount":0,"pusher":{"login":"ZJaume","name":"Jaume 
Zaragoza","path":"/ZJaume","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/11339330?s=80&v=4"}},{"before":"fc6ab2f68aaec82f56e340c7eb99d40a0ff357b5","after":"d066592685c17f5efa2624029e6206f5a74db63f","ref":"refs/heads/master","pushedAt":"2024-04-19T12:57:56.000Z","pushType":"pr_merge","commitsCount":3,"pusher":{"login":"ZJaume","name":"Jaume Zaragoza","path":"/ZJaume","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/11339330?s=80&v=4"},"commit":{"message":"Update minimum CMake version","shortMessageHtmlLink":"Update minimum CMake version"}},{"before":null,"after":"057ce06b1a3f5ba6c81e0cbda8585f100bbeecf3","ref":"refs/heads/nlohmann_json_fix","pushedAt":"2024-04-19T09:36:07.000Z","pushType":"branch_creation","commitsCount":0,"pusher":{"login":"ZJaume","name":"Jaume Zaragoza","path":"/ZJaume","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/11339330?s=80&v=4"},"commit":{"message":"Update minimum CMake version","shortMessageHtmlLink":"Update minimum CMake version"}},{"before":"2b77807d6c05316825c418b5c9f58771df87f92a","after":"77c514f2df9da14a0143876d6ba2b140ec2da60a","ref":"refs/heads/nlohmann_json","pushedAt":"2024-04-19T09:29:23.000Z","pushType":"push","commitsCount":1,"pusher":{"login":"ZJaume","name":"Jaume Zaragoza","path":"/ZJaume","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/11339330?s=80&v=4"},"commit":{"message":"Fix text file not being written","shortMessageHtmlLink":"Fix text file not being written"}},{"before":"edb5a18b160eeeeb7059b90c43a1991f5551623e","after":"fc6ab2f68aaec82f56e340c7eb99d40a0ff357b5","ref":"refs/heads/master","pushedAt":"2024-04-18T09:47:33.000Z","pushType":"push","commitsCount":1,"pusher":{"login":"ZJaume","name":"Jaume Zaragoza","path":"/ZJaume","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/11339330?s=80&v=4"},"commit":{"message":"Fix text file not being written","shortMessageHtmlLink":"Fix text file not being 
written"}},{"before":"f8a53b2ab2a04e08ac9c44dc35e92d022816eb98","after":"2b77807d6c05316825c418b5c9f58771df87f92a","ref":"refs/heads/nlohmann_json","pushedAt":"2024-04-16T11:12:38.000Z","pushType":"push","commitsCount":1,"pusher":{"login":"ZJaume","name":"Jaume Zaragoza","path":"/ZJaume","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/11339330?s=80&v=4"},"commit":{"message":"Avoid segfault in html parser\n\nback() should not be called on an empty string, undefined behaviour.","shortMessageHtmlLink":"Avoid segfault in html parser"}},{"before":null,"after":"f8a53b2ab2a04e08ac9c44dc35e92d022816eb98","ref":"refs/heads/nlohmann_json","pushedAt":"2024-04-16T10:45:20.000Z","pushType":"branch_creation","commitsCount":0,"pusher":{"login":"ZJaume","name":"Jaume Zaragoza","path":"/ZJaume","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/11339330?s=80&v=4"},"commit":{"message":"Update minimum CMake version","shortMessageHtmlLink":"Update minimum CMake version"}},{"before":null,"after":"c9a40d92fa91c7eb5d5a22f4b1268ce675f05a34","ref":"refs/heads/http_status","pushedAt":"2024-04-10T16:00:16.000Z","pushType":"branch_creation","commitsCount":0,"pusher":{"login":"ZJaume","name":"Jaume Zaragoza","path":"/ZJaume","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/11339330?s=80&v=4"},"commit":{"message":"Filter by status code different than 20{0,3,6}","shortMessageHtmlLink":"Filter by status code different than 20{0,3,6}"}},{"before":"381bfde1d1e636a970134da23d82e0965d5d21f1","after":"edb5a18b160eeeeb7059b90c43a1991f5551623e","ref":"refs/heads/master","pushedAt":"2024-04-04T15:46:06.000Z","pushType":"push","commitsCount":1,"pusher":{"login":"ZJaume","name":"Jaume Zaragoza","path":"/ZJaume","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/11339330?s=80&v=4"},"commit":{"message":"Avoid segfault in html parser\n\nback() should not be called on an empty string, undefined behaviour.","shortMessageHtmlLink":"Avoid segfault in html 
parser"}},{"before":"0af3085aad38cb5a0325e67d413b14ea714b1f9b","after":"381bfde1d1e636a970134da23d82e0965d5d21f1","ref":"refs/heads/master","pushedAt":"2024-04-04T15:10:42.000Z","pushType":"pr_merge","commitsCount":1,"pusher":{"login":"ZJaume","name":"Jaume Zaragoza","path":"/ZJaume","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/11339330?s=80&v=4"},"commit":{"message":"Fail when WARC file, tagfilters or urlfilters can't be opened","shortMessageHtmlLink":"Fail when WARC file, tagfilters or urlfilters can't be opened"}},{"before":null,"after":"096117b77f5a3e3c59909e0eb3504a071e413758","ref":"refs/heads/file_error","pushedAt":"2024-04-04T15:03:40.000Z","pushType":"branch_creation","commitsCount":0,"pusher":{"login":"ZJaume","name":"Jaume Zaragoza","path":"/ZJaume","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/11339330?s=80&v=4"},"commit":{"message":"Fail when WARC file, tagfilters or urlfilters can't be opened","shortMessageHtmlLink":"Fail when WARC file, tagfilters or urlfilters can't be opened"}},{"before":"6fffe62ee98d76a177b511556f01f41c76f8c071","after":"0af3085aad38cb5a0325e67d413b14ea714b1f9b","ref":"refs/heads/master","pushedAt":"2024-03-15T09:54:37.000Z","pushType":"push","commitsCount":1,"pusher":{"login":"ZJaume","name":"Jaume Zaragoza","path":"/ZJaume","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/11339330?s=80&v=4"},"commit":{"message":"Update fasttext\n\nFixes wrong c++ standard flag in cmake","shortMessageHtmlLink":"Update fasttext"}},{"before":"549b5641d4806efe3409e9c1b5d0a7384cd218b6","after":"6fffe62ee98d76a177b511556f01f41c76f8c071","ref":"refs/heads/master","pushedAt":"2024-02-02T14:34:44.000Z","pushType":"push","commitsCount":1,"pusher":{"login":"ZJaume","name":"Jaume Zaragoza","path":"/ZJaume","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/11339330?s=80&v=4"},"commit":{"message":"Update CLI parameters in help and README","shortMessageHtmlLink":"Update CLI parameters in help and 
README"}},{"before":"9d356fd4992500452b3a5cae818b95f5f8b23c1b","after":"549b5641d4806efe3409e9c1b5d0a7384cd218b6","ref":"refs/heads/master","pushedAt":"2024-02-02T14:18:06.000Z","pushType":"push","commitsCount":2,"pusher":{"login":"ZJaume","name":"Jaume Zaragoza","path":"/ZJaume","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/11339330?s=80&v=4"},"commit":{"message":"Rename metadata.jsonl.gz file to metadata.gz for consistency\n\nThe rest of the files only have the file extension for the compressed\nformat. So keep it that way.","shortMessageHtmlLink":"Rename metadata.jsonl.gz file to metadata.gz for consistency"}},{"before":"731084660934c95bd5113b193aacb4bd7d256493","after":"9d356fd4992500452b3a5cae818b95f5f8b23c1b","ref":"refs/heads/master","pushedAt":"2024-02-02T14:04:22.000Z","pushType":"pr_merge","commitsCount":1,"pusher":{"login":"ZJaume","name":"Jaume Zaragoza","path":"/ZJaume","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/11339330?s=80&v=4"},"commit":{"message":"Move JSONL output to --stdout and allow file-based output with JSONL\n\nThis changes the behaviour of the --jsonl option. Now the option only\nchanges the output format to JSONL but not the output location. 
To write\nJSONL with all the metadata and text to stdout use along with --stdout.\nOtherwise it will change the files that are base64 encoded (html, text)\nto jsonl format.\n\nThis change allows us in HPLT to take advantage of higher compression\nratios when saving HTML in escaped inside a JSON, instead of base64, who\nhas way worse compression ratios (as mentioned in #34).","shortMessageHtmlLink":"Move JSONL output to --stdout and allow file-based output with JSONL"}},{"before":"b601c1e4344f26dd997f18c9538e4ca23cc5f064","after":"8f6068b9973b8c5b15b3c6dd40988eb5c965f769","ref":"refs/heads/json_html","pushedAt":"2024-02-02T14:04:13.000Z","pushType":"force_push","commitsCount":0,"pusher":{"login":"ZJaume","name":"Jaume Zaragoza","path":"/ZJaume","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/11339330?s=80&v=4"},"commit":{"message":"Move JSONL output to --stdout and allow file-based output with JSONL\n\nThis changes the behaviour of the --jsonl option. Now the option only\nchanges the output format to JSONL but not the output location. 
To write\nJSONL with all the metadata and text to stdout use along with --stdout.\nOtherwise it will change the files that are base64 encoded (html, text)\nto jsonl format.\n\nThis change allows us in HPLT to take advantage of higher compression\nratios when saving HTML in escaped inside a JSON, instead of base64, who\nhas way worse compression ratios (as mentioned in #34).","shortMessageHtmlLink":"Move JSONL output to --stdout and allow file-based output with JSONL"}},{"before":"f4e2a8303add5d10c115fb91a811efb6163b291b","after":"b601c1e4344f26dd997f18c9538e4ca23cc5f064","ref":"refs/heads/json_html","pushedAt":"2024-02-02T14:03:34.000Z","pushType":"force_push","commitsCount":0,"pusher":{"login":"ZJaume","name":"Jaume Zaragoza","path":"/ZJaume","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/11339330?s=80&v=4"},"commit":{"message":"Move JSONL output to --stdout and allow file-based output with JSONL\n\nThis changes the behaviour of the --jsonl option. Now the option only\nchanges the output format to JSONL but not the output location. 
To write\nJSONL with all the metadata and text to stdout use along with --stdout.\nOtherwise it will change the files that are base64 encoded (html, text)\nto jsonl format.\n\nThis change allows us in HPLT to take advantage of higher compression\nratios when saving HTML in escaped inside a JSON, instead of base64, who\nhas way worse compression ratios (as mentioned in #34).","shortMessageHtmlLink":"Move JSONL output to --stdout and allow file-based output with JSONL"}},{"before":"71d5a3d9c83dda6580a29a220f62630138c4cf73","after":"731084660934c95bd5113b193aacb4bd7d256493","ref":"refs/heads/master","pushedAt":"2024-02-02T14:02:16.000Z","pushType":"pr_merge","commitsCount":1,"pusher":{"login":"ZJaume","name":"Jaume Zaragoza","path":"/ZJaume","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/11339330?s=80&v=4"},"commit":{"message":"ZSTD compression support\n\nThis adds both --compress and --compress-level options to select\ncompression algorithm and level.\n\nReplaces GzipWriter with a generic class CompressWriter that uses boost\nfiltering_stream api to easily switch between algorithms.","shortMessageHtmlLink":"ZSTD compression support"}},{"before":"53709d06456b8e357fe93ce6ebb12b078ecd3084","after":"1025e4d003b631d61c6d286a43a46698b03913d1","ref":"refs/heads/zstd","pushedAt":"2024-02-02T14:01:52.000Z","pushType":"force_push","commitsCount":0,"pusher":{"login":"ZJaume","name":"Jaume Zaragoza","path":"/ZJaume","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/11339330?s=80&v=4"},"commit":{"message":"ZSTD compression support\n\nThis adds both --compress and --compress-level options to select\ncompression algorithm and level.\n\nReplaces GzipWriter with a generic class CompressWriter that uses boost\nfiltering_stream api to easily switch between algorithms.","shortMessageHtmlLink":"ZSTD compression 
support"}},{"before":"6a514b467238d6ee4b865a99bba022b850faa7ea","after":"71d5a3d9c83dda6580a29a220f62630138c4cf73","ref":"refs/heads/master","pushedAt":"2024-02-02T13:56:22.000Z","pushType":"pr_merge","commitsCount":5,"pusher":{"login":"ZJaume","name":"Jaume Zaragoza","path":"/ZJaume","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/11339330?s=80&v=4"},"commit":{"message":"Convert the whole html to utf8 when text extraction is skipped","shortMessageHtmlLink":"Convert the whole html to utf8 when text extraction is skipped"}},{"before":null,"after":"f4e2a8303add5d10c115fb91a811efb6163b291b","ref":"refs/heads/json_html","pushedAt":"2024-01-25T16:42:46.000Z","pushType":"branch_creation","commitsCount":0,"pusher":{"login":"ZJaume","name":"Jaume Zaragoza","path":"/ZJaume","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/11339330?s=80&v=4"},"commit":{"message":"Move JSONL output to --stdout and allow file-based output with JSONL\n\nThis changes the behaviour of the --jsonl option. Now the option only\nchanges the output format to JSONL but not the output location. 
To write\nJSONL with all the metadata and text to stdout use along with --stdout.\nOtherwise it will change the files that are base64 encoded (html, text)\nto jsonl format.\n\nThis change allows us in HPLT to take advantage of higher compression\nratios when saving HTML in escaped inside a JSON, instead of base64, who\nhas way worse compression ratios (as mentioned in #34).","shortMessageHtmlLink":"Move JSONL output to --stdout and allow file-based output with JSONL"}},{"before":"ffb7b0d07dc8eea0646396ccb53898e4d96058da","after":"53709d06456b8e357fe93ce6ebb12b078ecd3084","ref":"refs/heads/zstd","pushedAt":"2024-01-25T11:23:40.000Z","pushType":"force_push","commitsCount":0,"pusher":{"login":"ZJaume","name":"Jaume Zaragoza","path":"/ZJaume","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/11339330?s=80&v=4"},"commit":{"message":"ZSTD compression support\n\nThis adds both --compress and --compress-level options to select\ncompression algorithm and level.\n\nReplaces GzipWriter with a generic class CompressWriter that uses boost\nfiltering_stream api to easily switch between algorithms.","shortMessageHtmlLink":"ZSTD compression support"}},{"before":"90a6b8f151103ab5156696f7f29eaadc6a120b86","after":"8e8c660ca609b647d4e883864e5cd82e89862078","ref":"refs/heads/warc2html","pushedAt":"2024-01-25T11:22:17.000Z","pushType":"force_push","commitsCount":0,"pusher":{"login":"ZJaume","name":"Jaume Zaragoza","path":"/ZJaume","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/11339330?s=80&v=4"},"commit":{"message":"Convert the whole html to utf8 when text extraction is skipped","shortMessageHtmlLink":"Convert the whole html to utf8 when text extraction is skipped"}},{"before":null,"after":"ffb7b0d07dc8eea0646396ccb53898e4d96058da","ref":"refs/heads/zstd","pushedAt":"2024-01-25T10:42:25.000Z","pushType":"branch_creation","commitsCount":0,"pusher":{"login":"ZJaume","name":"Jaume 
Zaragoza","path":"/ZJaume","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/11339330?s=80&v=4"},"commit":{"message":"ZSTD compression support\n\nThis adds both --compress and --compress-level options to select\ncompression algorithm and level.\n\nReplaces GzipWriter with a generic class CompressWriter that uses boost\nfiltering_stream api to easily switch between algorithms.","shortMessageHtmlLink":"ZSTD compression support"}},{"before":"a8870ce39bba33fb1067966101142d96363aedd4","after":"90a6b8f151103ab5156696f7f29eaadc6a120b86","ref":"refs/heads/warc2html","pushedAt":"2024-01-25T10:12:52.000Z","pushType":"push","commitsCount":2,"pusher":{"login":"ZJaume","name":"Jaume Zaragoza","path":"/ZJaume","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/11339330?s=80&v=4"},"commit":{"message":"Convert the whole html to utf8 when text extraction is skipped","shortMessageHtmlLink":"Convert the whole html to utf8 when text extraction is skipped"}},{"before":null,"after":"a8870ce39bba33fb1067966101142d96363aedd4","ref":"refs/heads/warc2html","pushedAt":"2024-01-23T11:27:41.000Z","pushType":"branch_creation","commitsCount":0,"pusher":{"login":"ZJaume","name":"Jaume Zaragoza","path":"/ZJaume","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/11339330?s=80&v=4"},"commit":{"message":"New output file containing all the metadata in JSON format","shortMessageHtmlLink":"New output file containing all the metadata in JSON format"}},{"before":"7cec35707e21ed872512198bc5f6d278ced73f9c","after":"6a514b467238d6ee4b865a99bba022b850faa7ea","ref":"refs/heads/master","pushedAt":"2024-01-22T11:24:18.000Z","pushType":"pr_merge","commitsCount":21,"pusher":{"login":"ZJaume","name":"Jaume Zaragoza","path":"/ZJaume","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/11339330?s=80&v=4"},"commit":{"message":"Merge pull request #35 from jelmervdl/metadata-only\n\nAdd `--jsonl` option","shortMessageHtmlLink":"Merge pull request #35 from 
jelmervdl/metadata-only"}},{"before":"2b410a1fc68f43cdb24c87ac961d31894acd1442","after":"c4698011a79e7515688ad6f3cf41d25ef91a1ace","ref":"refs/heads/domain-filter","pushedAt":"2024-01-11T17:27:48.000Z","pushType":"push","commitsCount":1,"pusher":{"login":"ZJaume","name":"Jaume Zaragoza","path":"/ZJaume","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/11339330?s=80&v=4"},"commit":{"message":"Read domain list compressed\n\nThis reduces significantly the reading time. Before this it took a\ncouple of seconds, now the time is less than 1 second.","shortMessageHtmlLink":"Read domain list compressed"}}],"hasNextPage":true,"hasPreviousPage":false,"activityType":"all","actor":null,"timePeriod":"all","sort":"DESC","perPage":30,"cursor":"djE6ks8AAAAERLN9YQA","startCursor":null,"endCursor":null}},"title":"Activity · bitextor/warc2text"}