Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add SitemapReader originally developed in OERSI #469

Draft
wants to merge 1 commit into
base: master
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
3 changes: 2 additions & 1 deletion metafacture-io/build.gradle
Expand Up @@ -22,10 +22,11 @@ dependencies {
implementation project(':metafacture-commons')
implementation 'commons-io:commons-io:2.5'
implementation 'org.apache.commons:commons-compress:1.21'
implementation 'org.jooq:joox-java-6:1.6.0'
implementation 'org.slf4j:slf4j-simple:1.7.21'
runtimeOnly 'org.tukaani:xz:1.6'
testImplementation 'com.github.tomakehurst:wiremock-jre8:2.33.2'
testImplementation 'junit:junit:4.12'
testImplementation 'org.mockito:mockito-core:2.5.5'
testImplementation 'org.assertj:assertj-core:3.11.1'
testRuntimeOnly 'org.slf4j:slf4j-simple:1.7.21'
}
132 changes: 132 additions & 0 deletions metafacture-io/src/main/java/org/metafacture/io/SitemapReader.java
@@ -0,0 +1,132 @@
/*
* Copyright 2020, 2022 Fabian Steeg, hbz
*
* Licensed under the Apache License, Version 2.0 the "License";
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.metafacture.io;

import org.metafacture.framework.FluxCommand;
import org.metafacture.framework.MetafactureException;
import org.metafacture.framework.ObjectReceiver;
import org.metafacture.framework.annotations.Description;
import org.metafacture.framework.annotations.In;
import org.metafacture.framework.annotations.Out;
import org.metafacture.framework.helpers.DefaultObjectPipe;

import org.joox.JOOX;
import org.joox.Match;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.xml.sax.SAXException;

import java.io.IOException;
import java.net.URL;
import java.util.List;
import java.util.Scanner;
import java.util.stream.Collectors;

/**
 * Reads a sitemap and emits URLs.
 *
 * <p>Each object received is treated as the URL of an XML sitemap. The text
 * content of every {@code loc} element is sent to the receiver, optionally
 * restricted by a regular expression ({@link #setFilter(String)}) and capped
 * at a maximum number of URLs ({@link #setLimit(int)}). The cap applies to
 * each received sitemap URL including all pages that are followed from it.
 *
 * <p>If the sitemap URL contains a {@code from=} parameter with a numeric
 * value, the reader advances that value by the number of {@code loc} entries
 * found on the current page and fetches the next page, until a page contains
 * no entries or the limit has been reached.
 *
 * @author Fabian Steeg (fsteeg)
 */
@Description("Reads an XML sitemap from a URL, sends the sitemap's `loc` URLs to the receiver. " +
        "If the sitemap URL contains a `from=` query string parameter, the reader will keep paging until no more results are returned. " +
        "Set `filter` to send only URLs matching a given regular expression to the receiver (defaults to sending all URLs). " +
        "Set `limit` to limit the total number of URLs to send to the receiver (defaults to sending all URLs, set explicitly with `-1`). " +
        "Set `wait` for the time (in milliseconds) to wait after sending a URL to the receiver (defaults to `1000` i.e. 1 second).")
@In(String.class)
@Out(String.class)
@FluxCommand("read-sitemap")
public final class SitemapReader extends DefaultObjectPipe<String, ObjectReceiver<String>> {

    private static final Logger LOG = LoggerFactory.getLogger(SitemapReader.class);
    private static final String FROM_PARAM = "from=";
    // Up to 18 decimal digits always fit into a long, even after adding an int page size.
    private static final int MAX_OFFSET_DIGITS = 18;
    private static final int DEFAULT_WAIT = 1000;
    private static final int DEFAULT_LIMIT = Integer.MAX_VALUE;

    private String filter;
    private int limit = DEFAULT_LIMIT;
    private int wait = DEFAULT_WAIT;

    /**
     * Creates an instance of {@link SitemapReader}.
     */
    public SitemapReader() { }

    /**
     * @param filter The regex to match for filtering which URLs should be sent to the receiver.
     */
    public void setFilter(final String filter) {
        this.filter = filter;
    }

    /**
     * @param limit The total number of URLs that should be sent to the receiver (-1 for unlimited).
     */
    public void setLimit(final int limit) {
        this.limit = limit < 0 ? Integer.MAX_VALUE : limit;
    }

    /**
     * @param wait The time (in milliseconds) to wait after a URL has been sent to the receiver.
     */
    public void setWait(final int wait) {
        this.wait = wait;
    }

    /**
     * Fetches the sitemap at the given URL, sends its (filtered) {@code loc}
     * URLs to the receiver and follows {@code from=} paging where available.
     *
     * @param sitemap the URL of the sitemap to read
     * @throws MetafactureException if the sitemap cannot be loaded or parsed,
     *                              or if the thread is interrupted while waiting
     */
    @Override
    public void process(final String sitemap) {
        String page = sitemap;
        // Number of URLs that may still be sent for this sitemap, across all of its pages.
        int remaining = limit;
        try {
            // Iterate instead of recursing so the stack does not grow with the number of pages.
            while (page != null) {
                LOG.debug("Processing sitemap URL {}", page);
                final Match siteMapXml = JOOX.$(new URL(page));
                final List<String> locations = siteMapXml.find("loc")
                        .map(m -> m.element().getTextContent().trim());
                final List<String> urls = locations.stream()
                        .filter(s -> filter == null || s.matches(filter))
                        .limit(remaining)
                        .collect(Collectors.toList());
                sendAll(urls);
                remaining -= urls.size();
                // The paging offset must advance by the unfiltered page size; an empty
                // page means the sitemap has no more results.
                final boolean isDone = remaining <= 0 || locations.isEmpty();
                page = isDone ? null : nextPage(page, locations.size());
            }
        }
        catch (final SAXException | IOException e) {
            throw new MetafactureException(e.getMessage(), e);
        }
        catch (final InterruptedException e) {
            Thread.currentThread().interrupt();
            throw new MetafactureException(e.getMessage(), e);
        }
    }

    /**
     * Sends every given URL to the receiver, pausing for the configured wait
     * time after each one.
     */
    private void sendAll(final List<String> urls) throws InterruptedException {
        for (final String url : urls) {
            LOG.trace("Processing resource URL {}", url);
            getReceiver().process(url);
            Thread.sleep(wait);
        }
    }

    /**
     * Builds the URL of the page following the given sitemap page.
     *
     * @param sitemap  the URL of the current page
     * @param pageSize the number of entries found on the current page
     * @return the URL with the first {@code from=} value advanced by
     *         {@code pageSize}, or {@code null} if the URL has no
     *         {@code from=} parameter with a usable numeric value
     */
    private static String nextPage(final String sitemap, final int pageSize) {
        final int paramStart = sitemap.indexOf(FROM_PARAM);
        if (paramStart < 0) {
            return null;
        }
        final int valueStart = paramStart + FROM_PARAM.length();
        int valueEnd = valueStart;
        // Read only the leading digits so that a following "&other=..." does not prevent paging.
        while (valueEnd < sitemap.length() && isAsciiDigit(sitemap.charAt(valueEnd))) {
            ++valueEnd;
        }
        final int digits = valueEnd - valueStart;
        if (digits == 0 || digits > MAX_OFFSET_DIGITS) {
            return null;
        }
        final long lastFrom = Long.parseLong(sitemap.substring(valueStart, valueEnd));
        final long nextFrom = lastFrom + pageSize;
        // Replace exactly the parsed value, leaving the rest of the URL untouched.
        return sitemap.substring(0, valueStart) + nextFrom + sitemap.substring(valueEnd);
    }

    /** Checks whether the character is one of the ASCII digits {@code 0} to {@code 9}. */
    private static boolean isAsciiDigit(final char c) {
        return c >= '0' && c <= '9';
    }

}
1 change: 1 addition & 0 deletions metafacture-io/src/main/resources/flux-commands.properties
Expand Up @@ -22,3 +22,4 @@ write org.metafacture.io.ObjectWriter
as-records org.metafacture.io.RecordReader
open-resource org.metafacture.io.ResourceOpener
open-tar org.metafacture.io.TarReader
read-sitemap org.metafacture.io.SitemapReader
@@ -0,0 +1,83 @@
/*
* Copyright 2020, 2022 Fabian Steeg, hbz
*
* Licensed under the Apache License, Version 2.0 the "License";
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.metafacture.io;

import org.metafacture.framework.MetafactureException;
import org.metafacture.framework.ObjectReceiver;

import org.junit.After;
import org.junit.Before;
import org.junit.Test;
import org.mockito.InOrder;
import org.mockito.Mock;
import org.mockito.Mockito;
import org.mockito.MockitoAnnotations;

import java.util.Arrays;
import java.util.List;

/**
 * Tests for {@link SitemapReader}.
 *
 * <p>The tests read the {@code sitemap.xml} resource located next to this
 * class and verify which URLs are passed on to a mocked receiver.
 *
 * @author Fabian Steeg
 *
 */
public final class SitemapReaderTest {

    private static final String SITEMAP = "sitemap.xml";

    private static final String URL_CUSTOMER = "https://www.oncampus.de/Customer_Experience_Management";
    private static final String URL_MATHE = "https://www.oncampus.de/Propädeutik_Mathe_Grundlagen";
    private static final String URL_MDR = "https://www.oncampus.de/MDR/Websession2020";

    private SitemapReader sitemapReader;

    @Mock
    private ObjectReceiver<String> receiver;
    private InOrder inOrder;

    @Before
    public void setup() {
        MockitoAnnotations.initMocks(this);
        sitemapReader = new SitemapReader();
        sitemapReader.setWait(0); // we're not actually crawling any urls in the tests
        sitemapReader.setReceiver(receiver);
        inOrder = Mockito.inOrder(receiver);
    }

    @Test
    public void testShouldProcessAll() {
        sitemapReader.process(getClass().getResource(SITEMAP).toString());
        inOrder.verify(receiver).process(URL_CUSTOMER);
        inOrder.verify(receiver).process(URL_MATHE);
        inOrder.verify(receiver).process(URL_MDR);
        inOrder.verifyNoMoreInteractions();
    }

    @Test
    public void testShouldProcessPattern() {
        // The filter has to be configured before processing, otherwise it is never applied.
        sitemapReader.setFilter(".*/MDR/.*");
        sitemapReader.process(getClass().getResource(SITEMAP).toString());
        // Strict (non-ordered) verification: an in-order check would ignore
        // unfiltered URLs sent before the matching one.
        Mockito.verify(receiver).process(URL_MDR);
        Mockito.verifyNoMoreInteractions(receiver);
    }

    @Test
    public void testShouldProcessLimit() {
        sitemapReader.setLimit(2);
        sitemapReader.process(getClass().getResource(SITEMAP).toString());
        Mockito.verify(receiver).process(URL_CUSTOMER);
        Mockito.verify(receiver).process(URL_MATHE);
        Mockito.verifyNoMoreInteractions(receiver);
    }

    @Test(expected = MetafactureException.class)
    public void testShouldThrowOnInvalidUrl() {
        sitemapReader.process("");
    }

    @After
    public void cleanup() {
        sitemapReader.closeStream();
    }
}
12 changes: 12 additions & 0 deletions metafacture-io/src/test/resources/org/metafacture/io/sitemap.xml
@@ -0,0 +1,12 @@
<?xml version="1.0" encoding="utf-8"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
<url>
<loc>https://www.oncampus.de/Customer_Experience_Management</loc>
</url>
<url>
<loc>https://www.oncampus.de/Propädeutik_Mathe_Grundlagen</loc>
</url>
<url>
<loc>https://www.oncampus.de/MDR/Websession2020</loc>
</url>
</urlset>