Skip to content

Commit

Permalink
Initial commit
Browse files Browse the repository at this point in the history
  • Loading branch information
repat committed Sep 5, 2019
0 parents commit 813152e
Show file tree
Hide file tree
Showing 13 changed files with 1,029 additions and 0 deletions.
3 changes: 3 additions & 0 deletions .gitignore
@@ -0,0 +1,3 @@
build
composer.lock
vendor
22 changes: 22 additions & 0 deletions LICENSE
@@ -0,0 +1,22 @@
[MIT LICENSE]

Copyright (c) 2019 repat, https://repat.de

Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:

The above copyright notice and this permission notice shall be
included in all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
61 changes: 61 additions & 0 deletions README.md
@@ -0,0 +1,61 @@
# spatie-crawler-redis
[![Latest Version on Packagist](https://img.shields.io/packagist/v/repat/spatie-crawler-redis.svg?style=flat-square)](https://packagist.org/packages/repat/spatie-crawler-redis)
[![Total Downloads](https://img.shields.io/packagist/dt/repat/spatie-crawler-redis.svg?style=flat-square)](https://packagist.org/packages/repat/spatie-crawler-redis)

**spatie-crawler-redis** is an alternative crawl queue for [spatie/crawler](https://github.com/spatie/crawler). It implements the `Spatie\Crawler\CrawlQueue\CrawlQueue` interface using Redis Hashes.

## Installation
`$ composer require repat/spatie-crawler-redis`

## Example
Create a `Predis\Client` beforehand if you need options, such as selecting a database. If you don't pass a client, a new one without options will be used. Predis assumes `127.0.0.1`, `6379` and `0` as default host, port and database. You can also pass a custom prefix, otherwise `uniqid()` will be used.

```php
// see https://github.com/nrk/predis for options
$options = [
'database' => 7,
];

$prefix = uniqid() . ':'; // same as passing no prefix

$redisClient = new \Predis\Client($options);

// ...
->setCrawlQueue(new RedisCrawlQueue($redisClient, $prefix))

// uses new \Predis\Client without options
->setCrawlQueue(new RedisCrawlQueue())
```

## TODO
* `phpredis` support

## Testing
> Thanks to spatie for the tests. These are the instructions:
To run the tests, you'll first have to start the included Node-based server in a separate terminal window.

```bash
cd tests/server
npm install
./start_server.sh
```

With the server running, you can start testing.
```bash
vendor/bin/phpunit
```

## License
* MIT, see [LICENSE](https://github.com/repat/spatie-crawler-redis/blob/master/LICENSE)

## Version
* Version 0.1

## Contact
#### repat
* Homepage: https://repat.de
* e-mail: repat@repat.de
* Twitter: [@repat123](https://twitter.com/repat123 "repat123 on twitter")

[![Flattr this git repo](http://api.flattr.com/button/flattr-badge-large.png)](https://flattr.com/submit/auto?user_id=repat&url=https://github.com/repat/spatie-crawler-redis&title=spatie-crawler-redis&language=&tags=github&category=software)
30 changes: 30 additions & 0 deletions composer.json
@@ -0,0 +1,30 @@
{
"name": "repat/spatie-crawler-redis",
"description": "Redis CrawlQueue for spatie/crawler",
"keywords": ["spatie", "crawler", "redis", "crawlqueue", "predis"],
"homepage": "https://repat.de",
"license": "MIT",
"version" : "0.1",
"authors": [
{"name": "repat", "email": "repat@repat.de"}
],
"require": {
"php": ">=7.1",
"spatie/crawler": "^4.6",
"predis/predis": "^1.1"
},
"require-dev": {
"phpunit/phpunit": "^7.0",
"larapack/dd": "^1.1"
},
"autoload": {
"psr-4": {
"Repat\\CrawlQueue\\": "src/Repat/CrawlQueue"
}
},
"autoload-dev": {
"psr-4": {
"Spatie\\Crawler\\Test\\": "tests"
}
}
}
29 changes: 29 additions & 0 deletions phpunit.xml.dist
@@ -0,0 +1,29 @@
<?xml version="1.0" encoding="UTF-8"?>
<phpunit bootstrap="vendor/autoload.php"
backupGlobals="false"
backupStaticAttributes="false"
colors="true"
verbose="true"
convertErrorsToExceptions="true"
convertNoticesToExceptions="true"
convertWarningsToExceptions="true"
processIsolation="false"
stopOnFailure="false">
<testsuites>
<testsuite name="League Test Suite">
<directory>tests</directory>
</testsuite>
</testsuites>
<filter>
<whitelist>
<directory suffix=".php">src/</directory>
</whitelist>
</filter>
<logging>
<log type="tap" target="build/report.tap"/>
<log type="junit" target="build/report.junit.xml"/>
<log type="coverage-html" target="build/coverage"/>
<log type="coverage-text" target="build/coverage.txt"/>
<log type="coverage-clover" target="build/logs/clover.xml"/>
</logging>
</phpunit>
133 changes: 133 additions & 0 deletions src/Repat/CrawlQueue/RedisCrawlQueue.php
@@ -0,0 +1,133 @@
<?php

namespace Repat\CrawlQueue;

use Predis\Client;
use Psr\Http\Message\UriInterface;
use Spatie\Crawler\CrawlUrl;
use Spatie\Crawler\Exception\InvalidUrl;
use Spatie\Crawler\Exception\UrlNotFoundByIndex;
use Spatie\Crawler\CrawlQueue\CrawlQueue;

/**
 * Implementation of CrawlQueue using Redis Hashes.
 *
 * Two Redis hashes are used: `urls` holds every known URL and `pending`
 * holds the ones that still have to be crawled. Both hashes are shared by
 * all crawls on the same Redis database, so every field name is prefixed
 * with a per-crawl prefix and every read is restricted to that prefix.
 *
 * The id assigned to a CrawlUrl is its prefixed field name
 * (`<prefix><url>`).
 */
class RedisCrawlQueue implements CrawlQueue
{
    // All known URLs, indexed by prefixed URL string.
    const URLS = 'urls';
    // Pending URLs, indexed by prefixed URL string.
    const PENDING_URLS = 'pending';

    /**
     * Redis Instance
     * @var \Predis\Client
     */
    private $redis;

    /**
     * Prefix for this crawl, always ending in a colon
     * @var string
     */
    private $prefix;

    /**
     * @param \Predis\Client|null $redis  Client to use; a default `new Client()` is created when omitted.
     * @param string|null         $prefix Field prefix for this crawl; `uniqid()` is used when omitted.
     */
    public function __construct(?Client $redis = null, ?string $prefix = null)
    {
        $this->redis = $redis ?? new Client();

        $this->prefix = $prefix ?? uniqid() . ':';

        // make sure prefix has a colon at the end
        if (substr($this->prefix, -1) !== ':') {
            $this->prefix .= ':';
        }
    }

    /**
     * Removes every field belonging to this crawl from both hashes.
     *
     * The pending hash has to be cleaned as well: a crawl that stops early
     * would otherwise leave its pending entries behind in the shared hash.
     */
    public function __destruct()
    {
        foreach ([self::URLS, self::PENDING_URLS] as $hash) {
            $ownKeys = $this->ownKeys($hash);

            // HDEL needs at least one field.
            if ($ownKeys !== []) {
                $this->redis->hdel($hash, $ownKeys);
            }
        }
    }

    /**
     * Adds a URL to the queue unless this crawl already knows it.
     *
     * @param  CrawlUrl $url
     * @return CrawlQueue $this, for chaining
     */
    public function add(CrawlUrl $url) : CrawlQueue
    {
        $id = $this->prefix . (string) $url->url;

        // The lookup must use the prefixed field name; checking the bare URL
        // string never matches and would re-queue already processed URLs.
        if (!$this->redis->hexists(self::URLS, $id)) {
            $url->setId($id);

            $serialized = serialize($url);
            $this->redis->hset(self::URLS, $id, $serialized);
            $this->redis->hset(self::PENDING_URLS, $id, $serialized);
        }

        return $this;
    }

    /**
     * Tells whether a URL is known to this crawl.
     *
     * @param  CrawlUrl|UriInterface|string $crawlUrl A CrawlUrl or UriInterface (the prefix is
     *                                                added here), or a string that is taken
     *                                                as-is, i.e. an already prefixed id.
     * @return bool
     *
     * @throws InvalidUrl when $crawlUrl is of any other type
     */
    public function has($crawlUrl) : bool
    {
        if ($crawlUrl instanceof CrawlUrl) {
            $field = $this->prefix . (string) $crawlUrl->url;
        } elseif ($crawlUrl instanceof UriInterface) {
            $field = $this->prefix . (string) $crawlUrl;
        } elseif (is_string($crawlUrl)) {
            $field = $crawlUrl;
        } else {
            throw InvalidUrl::unexpectedType($crawlUrl);
        }

        return (bool) $this->redis->hexists(self::URLS, $field);
    }

    /**
     * Tells whether this crawl (and only this crawl) still has pending URLs.
     *
     * @return bool
     */
    public function hasPendingUrls() : bool
    {
        foreach ($this->redis->hkeys(self::PENDING_URLS) as $key) {
            if ($this->ownsKey((string) $key)) {
                return true;
            }
        }

        return false;
    }

    /**
     * Fetches a known URL by its id (the prefixed field name).
     *
     * @param  string $id
     * @return CrawlUrl
     *
     * @throws UrlNotFoundByIndex when no such URL is stored
     */
    public function getUrlById($id) : CrawlUrl
    {
        if (!$this->has($id)) {
            throw new UrlNotFoundByIndex("Crawl url {$id} not found in hashes.");
        }

        $payload = $this->redis->hget(self::URLS, $id);

        // The field can vanish between HEXISTS and HGET; unserialize(null)
        // would yield false and violate the return type.
        if ($payload === null) {
            throw new UrlNotFoundByIndex("Crawl url {$id} not found in hashes.");
        }

        // NOTE(review): unserialize() trusts whatever is stored in Redis;
        // only use a Redis instance that untrusted parties cannot write to.
        return unserialize($payload);
    }

    /**
     * Returns one pending URL of this crawl, or null when none is left.
     *
     * @return CrawlUrl|null
     */
    public function getFirstPendingUrl() : ?CrawlUrl
    {
        foreach ($this->redis->hkeys(self::PENDING_URLS) as $key) {
            // Skip pending URLs that belong to other crawls.
            if (!$this->ownsKey((string) $key)) {
                continue;
            }

            $payload = $this->redis->hget(self::PENDING_URLS, $key);

            // Field was removed in the meantime, try the next one.
            if ($payload === null) {
                continue;
            }

            // NOTE(review): see getUrlById() regarding unserialize().
            return unserialize($payload);
        }

        return null;
    }

    /**
     * A URL counts as processed when it is known but no longer pending.
     *
     * @param  CrawlUrl $url
     * @return bool
     */
    public function hasAlreadyBeenProcessed(CrawlUrl $url) : bool
    {
        $id = $this->prefix . (string) $url->url;

        if ($this->redis->hexists(self::PENDING_URLS, $id)) {
            return false;
        }

        return (bool) $this->redis->hexists(self::URLS, $id);
    }

    /**
     * Removes the URL from the pending hash; it stays in the known URLs.
     *
     * @param CrawlUrl $crawlUrl
     */
    public function markAsProcessed(CrawlUrl $crawlUrl)
    {
        $this->redis->hdel(self::PENDING_URLS, $this->prefix . (string) $crawlUrl->url);
    }

    /**
     * Tells whether a hash field name starts with this crawl's prefix.
     */
    private function ownsKey(string $key) : bool
    {
        return strncmp($key, $this->prefix, strlen($this->prefix)) === 0;
    }

    /**
     * Lists the field names of the given hash that start with this crawl's prefix.
     *
     * @param  string $hash Name of the Redis hash (self::URLS or self::PENDING_URLS)
     * @return string[]
     */
    private function ownKeys(string $hash) : array
    {
        $own = [];

        foreach ($this->redis->hkeys($hash) as $key) {
            if ($this->ownsKey((string) $key)) {
                $own[] = $key;
            }
        }

        return $own;
    }
}

0 comments on commit 813152e

Please sign in to comment.