{"payload":{"pageCount":1,"repositories":[{"type":"Public","name":"crawlurlfrontier","owner":"DigitalPebble","isFork":false,"description":"Crawl config used to test URL Frontier on a large scale and produce WARCs for CommonCrawl.","allTopics":[],"primaryLanguage":{"name":"FLUX","color":"#88ccff"},"pullRequestCount":0,"issueCount":0,"starsCount":1,"forksCount":0,"license":null,"participation":null,"lastUpdated":{"hasBeenPushedTo":true,"timestamp":"2024-05-16T12:54:22.108Z"}},{"type":"Public","name":"storm","owner":"DigitalPebble","isFork":true,"description":"Mirror of Apache Storm","allTopics":[],"primaryLanguage":{"name":"Java","color":"#b07219"},"pullRequestCount":0,"issueCount":0,"starsCount":0,"forksCount":4071,"license":"Apache License 2.0","participation":null,"lastUpdated":{"hasBeenPushedTo":true,"timestamp":"2024-04-10T17:26:00.287Z"}},{"type":"Public","name":"tika-detector-stormcrawler","owner":"DigitalPebble","isFork":false,"description":"Wraps the charset detection logic from StormCrawler as a Tika module","allTopics":[],"primaryLanguage":{"name":"Java","color":"#b07219"},"pullRequestCount":0,"issueCount":0,"starsCount":0,"forksCount":1,"license":"Apache License 2.0","participation":null,"lastUpdated":{"hasBeenPushedTo":true,"timestamp":"2024-02-02T15:34:38.292Z"}},{"type":"Public","name":"tika","owner":"DigitalPebble","isFork":true,"description":"The Apache Tika toolkit detects and extracts metadata and text from over a thousand different file types (such as PPT, XLS, and PDF).","allTopics":[],"primaryLanguage":{"name":"Java","color":"#b07219"},"pullRequestCount":0,"issueCount":0,"starsCount":0,"forksCount":745,"license":"Apache License 2.0","participation":null,"lastUpdated":{"hasBeenPushedTo":true,"timestamp":"2024-01-25T09:39:50.293Z"}},{"type":"Public","name":"benchmark","owner":"DigitalPebble","isFork":false,"description":"StormCrawler topology to evaluate the performance of different backends and configurations","allTopics":["elasticsearch","benchmark","opensearch","stormcrawler"],"primaryLanguage":{"name":"Shell","color":"#89e051"},"pullRequestCount":0,"issueCount":0,"starsCount":0,"forksCount":0,"license":null,"participation":null,"lastUpdated":{"hasBeenPushedTo":true,"timestamp":"2024-01-22T15:16:59.949Z"}},{"type":"Public","name":"stormcrawler-docker","owner":"DigitalPebble","isFork":false,"description":"Resources for running StormCrawler with Docker services","allTopics":["docker","apache-storm","stormcrawler"],"primaryLanguage":{"name":"Dockerfile","color":"#384d54"},"pullRequestCount":0,"issueCount":0,"starsCount":7,"forksCount":2,"license":"Apache License 2.0","participation":null,"lastUpdated":{"hasBeenPushedTo":true,"timestamp":"2024-01-18T08:59:17.623Z"}},{"type":"Public","name":"docs","owner":"DigitalPebble","isFork":true,"description":"Documentation for Docker Official Images in docker-library","allTopics":[],"primaryLanguage":{"name":"Shell","color":"#89e051"},"pullRequestCount":0,"issueCount":0,"starsCount":0,"forksCount":2187,"license":"MIT License","participation":null,"lastUpdated":{"hasBeenPushedTo":true,"timestamp":"2024-01-16T10:18:34.847Z"}},{"type":"Public","name":"ansible-storm","owner":"DigitalPebble","isFork":false,"description":"Ansible playbook for deploying a Storm cluster","allTopics":["storm","playbook","stormcrawler","ansible"],"primaryLanguage":null,"pullRequestCount":0,"issueCount":0,"starsCount":7,"forksCount":1,"license":null,"participation":null,"lastUpdated":{"hasBeenPushedTo":true,"timestamp":"2023-12-07T16:38:15.276Z"}},{"type":"Public","name":"nutch","owner":"DigitalPebble","isFork":true,"description":"Apache Nutch is an extensible and scalable web crawler","allTopics":[],"primaryLanguage":{"name":"Java","color":"#b07219"},"pullRequestCount":0,"issueCount":0,"starsCount":1,"forksCount":1250,"license":"Apache License 2.0","participation":null,"lastUpdated":{"hasBeenPushedTo":true,"timestamp":"2023-11-08T13:53:36.930Z"}},{"type":"Public","name":"digitalpebble.github.io","owner":"DigitalPebble","isFork":false,"description":"Resources for the DigitalPebble website","allTopics":[],"primaryLanguage":{"name":"SCSS","color":"#c6538c"},"pullRequestCount":0,"issueCount":0,"starsCount":0,"forksCount":0,"license":null,"participation":null,"lastUpdated":{"hasBeenPushedTo":true,"timestamp":"2023-02-01T16:31:17.396Z"}},{"type":"Public","name":"urlfrontier-client","owner":"DigitalPebble","isFork":false,"description":"URLFrontier client written in Rust (mostly as a way of learning Rust)","allTopics":["rust","grpc","webcrawler","url-frontier","urlfrontier"],"primaryLanguage":{"name":"Rust","color":"#dea584"},"pullRequestCount":0,"issueCount":0,"starsCount":1,"forksCount":0,"license":"Apache License 2.0","participation":null,"lastUpdated":{"hasBeenPushedTo":true,"timestamp":"2022-12-05T22:18:11.357Z"}},{"type":"Public","name":"crawler4j-frontier-battle","owner":"DigitalPebble","isFork":true,"description":"","allTopics":[],"primaryLanguage":{"name":"Java","color":"#b07219"},"pullRequestCount":0,"issueCount":0,"starsCount":0,"forksCount":1,"license":null,"participation":null,"lastUpdated":{"hasBeenPushedTo":true,"timestamp":"2022-04-06T08:55:38.249Z"}},{"type":"Public","name":"TextClassification","owner":"DigitalPebble","isFork":false,"description":"A Text Classification API in Java originally developed by DigitalPebble Ltd. The API is independent from the ML implementations used and can be used as a front end to various ML algorithms. libSVM and liblinear are currently embedded. ","allTopics":[],"primaryLanguage":{"name":"Java","color":"#b07219"},"pullRequestCount":0,"issueCount":1,"starsCount":48,"forksCount":21,"license":"Apache License 2.0","participation":null,"lastUpdated":{"hasBeenPushedTo":true,"timestamp":"2021-09-24T13:26:48.453Z"}},{"type":"Public","name":"stormcrawlerfight","owner":"DigitalPebble","isFork":false,"description":"Crawl configurations for benchmarking / testing StormCrawler","allTopics":[],"primaryLanguage":{"name":"Shell","color":"#89e051"},"pullRequestCount":0,"issueCount":0,"starsCount":9,"forksCount":5,"license":"Apache License 2.0","participation":null,"lastUpdated":{"hasBeenPushedTo":true,"timestamp":"2019-09-19T05:43:32.163Z"}},{"type":"Public archive","name":"behemoth","owner":"DigitalPebble","isFork":false,"description":"Behemoth is an open source platform for large scale document analysis based on Apache Hadoop.","allTopics":["nlp","mapreduce","java","hadoop"],"primaryLanguage":{"name":"Java","color":"#b07219"},"pullRequestCount":1,"issueCount":12,"starsCount":282,"forksCount":60,"license":"Other","participation":null,"lastUpdated":{"hasBeenPushedTo":true,"timestamp":"2018-04-25T10:58:00.520Z"}},{"type":"Public","name":"crawler-commons","owner":"DigitalPebble","isFork":true,"description":"A set of reusable Java components that implement functionality common to any web crawler","allTopics":[],"primaryLanguage":{"name":"Java","color":"#b07219"},"pullRequestCount":0,"issueCount":0,"starsCount":4,"forksCount":74,"license":"Apache License 2.0","participation":null,"lastUpdated":{"hasBeenPushedTo":true,"timestamp":"2017-04-04T08:54:52.008Z"}},{"type":"Public","name":"sc-warc","owner":"DigitalPebble","isFork":false,"description":"WARC resources for StormCrawler","allTopics":[],"primaryLanguage":null,"pullRequestCount":0,"issueCount":3,"starsCount":2,"forksCount":1,"license":null,"participation":null,"lastUpdated":{"hasBeenPushedTo":true,"timestamp":"2016-10-20T08:26:50.117Z"}},{"type":"Public archive","name":"tescobank","owner":"DigitalPebble","isFork":false,"description":"Setup for crawling tescobank with SC","allTopics":[],"primaryLanguage":{"name":"Java","color":"#b07219"},"pullRequestCount":0,"issueCount":0,"starsCount":4,"forksCount":2,"license":"Apache License 2.0","participation":null,"lastUpdated":{"hasBeenPushedTo":true,"timestamp":"2015-09-23T09:16:13.161Z"}},{"type":"Public","name":"textclassification-examples","owner":"DigitalPebble","isFork":false,"description":"Use cases for DigitalPebble's TextClassification API","allTopics":[],"primaryLanguage":{"name":"Java","color":"#b07219"},"pullRequestCount":0,"issueCount":0,"starsCount":10,"forksCount":3,"license":"Apache License 2.0","participation":null,"lastUpdated":{"hasBeenPushedTo":true,"timestamp":"2015-09-01T13:48:48.414Z"}},{"type":"Public archive","name":"behemoth-commoncrawl","owner":"DigitalPebble","isFork":false,"description":"Support for old (pre 2013) CommonCrawl dataset in Behemoth ","allTopics":[],"primaryLanguage":{"name":"Java","color":"#b07219"},"pullRequestCount":0,"issueCount":0,"starsCount":4,"forksCount":0,"license":null,"participation":null,"lastUpdated":{"hasBeenPushedTo":true,"timestamp":"2015-04-20T14:37:48.016Z"}},{"type":"Public","name":"tika-cc","owner":"DigitalPebble","isFork":false,"description":"resources for generating a corpus of docs from CC for Tika","allTopics":[],"primaryLanguage":{"name":"Shell","color":"#89e051"},"pullRequestCount":0,"issueCount":0,"starsCount":0,"forksCount":0,"license":null,"participation":null,"lastUpdated":{"hasBeenPushedTo":true,"timestamp":"2014-11-28T10:46:58.697Z"}},{"type":"Public","name":"NutchFight","owner":"DigitalPebble","isFork":false,"description":"Resources for comparison between 1.8 and 2.x of Apache Nutch","allTopics":[],"primaryLanguage":{"name":"Java","color":"#b07219"},"pullRequestCount":0,"issueCount":0,"starsCount":4,"forksCount":0,"license":"Apache License 2.0","participation":null,"lastUpdated":{"hasBeenPushedTo":true,"timestamp":"2014-06-04T13:45:32.050Z"}},{"type":"Public archive","name":"behemoth-elasticsearch","owner":"DigitalPebble","isFork":false,"description":"ElasticSearch module for Behemoth","allTopics":[],"primaryLanguage":{"name":"Java","color":"#b07219"},"pullRequestCount":0,"issueCount":0,"starsCount":1,"forksCount":0,"license":null,"participation":null,"lastUpdated":{"hasBeenPushedTo":true,"timestamp":"2014-02-12T15:53:10.000Z"}},{"type":"Public archive","name":"behemoth-textclassification","owner":"DigitalPebble","isFork":false,"description":"Module for classifying Behemoth documents with a model from our Text Classification API","allTopics":[],"primaryLanguage":{"name":"Java","color":"#b07219"},"pullRequestCount":0,"issueCount":0,"starsCount":1,"forksCount":0,"license":null,"participation":null,"lastUpdated":{"hasBeenPushedTo":true,"timestamp":"2012-11-22T10:27:58.000Z"}},{"type":"Public archive","name":"TextClassificationPlugin","owner":"DigitalPebble","isFork":false,"description":"GATE Processing Resource wrapping DigitalPebble's TextClassification API","allTopics":[],"primaryLanguage":{"name":"Java","color":"#b07219"},"pullRequestCount":1,"issueCount":1,"starsCount":5,"forksCount":3,"license":null,"participation":null,"lastUpdated":{"hasBeenPushedTo":true,"timestamp":"2012-07-12T09:51:33.000Z"}},{"type":"Public archive","name":"ngrams-api","owner":"DigitalPebble","isFork":false,"description":"Java API for querying a N-Grams corpus. Uses Lucene for searching and indexing from the Google Web-1T format ","allTopics":[],"primaryLanguage":{"name":"Java","color":"#b07219"},"pullRequestCount":0,"issueCount":0,"starsCount":4,"forksCount":2,"license":"Other","participation":null,"lastUpdated":{"hasBeenPushedTo":true,"timestamp":"2012-04-27T12:50:47.000Z"}}],"repositoryCount":26,"userInfo":null,"searchable":true,"definitions":[],"typeFilters":[{"id":"all","text":"All"},{"id":"public","text":"Public"},{"id":"source","text":"Sources"},{"id":"fork","text":"Forks"},{"id":"archived","text":"Archived"},{"id":"template","text":"Templates"}],"compactMode":false},"title":"Repositories"}