{"payload":{"pageCount":1,"repositories":[{"type":"Public","name":"KeywordAnalysis","owner":"CI-Research","isFork":false,"description":"Word analysis, by domain, on the Common Crawl data set for the purpose of finding industry trends","allTopics":["wordcount","keyword-extraction","cluster-analysis","commoncrawl"],"primaryLanguage":null,"pullRequestCount":0,"issueCount":0,"starsCount":57,"forksCount":13,"license":null,"participation":null,"lastUpdated":{"hasBeenPushedTo":true,"timestamp":"2024-01-28T03:35:16.774Z"}},{"type":"Public","name":"CI-HiBench","owner":"CI-Research","isFork":false,"description":"Big Data benchmark from Intel called HiBench","allTopics":[],"primaryLanguage":{"name":"Shell","color":"#89e051"},"pullRequestCount":0,"issueCount":0,"starsCount":0,"forksCount":0,"license":null,"participation":null,"lastUpdated":{"hasBeenPushedTo":true,"timestamp":"2018-03-05T21:03:03.547Z"}},{"type":"Public","name":"CommonCrawlDocumentDownload","owner":"CI-Research","isFork":true,"description":"A small tool which uses the CommonCrawl URL Index to download documents with certain file types or mime-types for mass-testing of frameworks like Apache POI and Apache Tika","allTopics":[],"primaryLanguage":{"name":"Java","color":"#b07219"},"pullRequestCount":0,"issueCount":0,"starsCount":0,"forksCount":20,"license":"BSD 2-Clause \"Simplified\" License","participation":null,"lastUpdated":{"hasBeenPushedTo":true,"timestamp":"2018-02-09T05:55:17.411Z"}},{"type":"Public","name":"cdx-index-client","owner":"CI-Research","isFork":true,"description":"A command-line tool for using CommonCrawl Index API at http://index.commoncrawl.org/","allTopics":[],"primaryLanguage":{"name":"Python","color":"#3572A5"},"pullRequestCount":0,"issueCount":0,"starsCount":1,"forksCount":48,"license":"MIT License","participation":null,"lastUpdated":{"hasBeenPushedTo":true,"timestamp":"2018-02-09T05:48:39.096Z"}},{"type":"Public","name":"HiBench","owner":"CI-Research","isFork":true,"description":"HiBench is a big data benchmark suite.","allTopics":[],"primaryLanguage":{"name":"Java","color":"#b07219"},"pullRequestCount":0,"issueCount":0,"starsCount":0,"forksCount":756,"license":"Other","participation":null,"lastUpdated":{"hasBeenPushedTo":true,"timestamp":"2018-01-31T02:06:05.851Z"}},{"type":"Public","name":"spark-Jupyter-AWS","owner":"CI-Research","isFork":true,"description":"A guide on how to set up Jupyter with Pyspark painlessly on AWS EC2 clusters, with S3 I/O support","allTopics":[],"primaryLanguage":{"name":"Jupyter Notebook","color":"#DA5B0B"},"pullRequestCount":0,"issueCount":0,"starsCount":1,"forksCount":19,"license":null,"participation":null,"lastUpdated":{"hasBeenPushedTo":true,"timestamp":"2017-04-24T02:00:06.303Z"}},{"type":"Public","name":"dkpro-c4corpus","owner":"CI-Research","isFork":true,"description":"DKPro C4CorpusTools is a collection of tools for processing CommonCrawl corpus, including Creative Commons license detection, boilerplate removal, language detection, and near-duplicate removal.","allTopics":[],"primaryLanguage":{"name":"Java","color":"#b07219"},"pullRequestCount":0,"issueCount":0,"starsCount":0,"forksCount":8,"license":"Apache License 2.0","participation":null,"lastUpdated":{"hasBeenPushedTo":true,"timestamp":"2016-12-20T12:48:32.113Z"}},{"type":"Public","name":"common_crawl_index","owner":"CI-Research","isFork":true,"description":"Index URLs in Common Crawl ","allTopics":[],"primaryLanguage":{"name":"Python","color":"#3572A5"},"pullRequestCount":0,"issueCount":0,"starsCount":0,"forksCount":49,"license":null,"participation":null,"lastUpdated":{"hasBeenPushedTo":true,"timestamp":"2016-09-06T11:44:51.858Z"}},{"type":"Public","name":"commoncrawl-examples","owner":"CI-Research","isFork":true,"description":"A library of examples showing how to use the Common Crawl corpus.","allTopics":[],"primaryLanguage":{"name":"Java","color":"#b07219"},"pullRequestCount":0,"issueCount":0,"starsCount":0,"forksCount":45,"license":null,"participation":null,"lastUpdated":{"hasBeenPushedTo":true,"timestamp":"2016-08-05T13:01:51.537Z"}}],"repositoryCount":9,"userInfo":null,"searchable":true,"definitions":[],"typeFilters":[{"id":"all","text":"All"},{"id":"public","text":"Public"},{"id":"source","text":"Sources"},{"id":"fork","text":"Forks"},{"id":"archived","text":"Archived"},{"id":"template","text":"Templates"}],"compactMode":false},"title":"Repositories"}