feat: add disallowed_domains option to OffsiteMiddleware #5922

Draft · wants to merge 1 commit into base: master
103 changes: 77 additions & 26 deletions scrapy/spidermiddlewares/offsite.py
@@ -56,32 +56,83 @@ def should_follow(self, request, spider):
return bool(regex.search(host))

def get_host_regex(self, spider):
"""Override this method to implement a different offsite policy"""
allowed_domains = getattr(spider, "allowed_domains", None)
if not allowed_domains:
return re.compile("") # allow all by default
url_pattern = re.compile(r"^https?://.*$")
port_pattern = re.compile(r":\d+$")
domains = []
for domain in allowed_domains:
if domain is None:
continue
if url_pattern.match(domain):
message = (
"allowed_domains accepts only domains, not URLs. "
f"Ignoring URL entry {domain} in allowed_domains."
)
warnings.warn(message, URLWarning)
elif port_pattern.search(domain):
message = (
"allowed_domains accepts only domains without ports. "
f"Ignoring entry {domain} in allowed_domains."
)
warnings.warn(message, PortWarning)
else:
domains.append(re.escape(domain))
regex = rf'^(.*\.)?({"|".join(domains)})$'
return re.compile(regex)
"""Override this method to implement a different offsite policy.

Returns a compiled regular expression object that matches the hosts that
are allowed to be crawled. If None is returned (or method is not overridden),
all hosts are allowed.

Example:
allowed_domains = ['example.com']
disallowed_domains = ['example2.com']

This will allow crawling all subdomains of example.com (eg. foo.example.com,
bar.example.com). But it won't allow crawling example2.com or any subdomain
(eg. www.example2.com).
"""
        allowed_domains_arg = getattr(spider, "allowed_domains", [])
        disallowed_domains_arg = getattr(spider, "disallowed_domains", [])

        url_pattern = re.compile(r"^https?://.*$")  # matches URLs, e.g. http://example.com
        port_pattern = re.compile(r":\d+$")  # matches a trailing port, e.g. :8080

        def process_domains(domains_list, domains_type):
            """
            Validate a domains list and return the valid entries, escaped
            for use in a regular expression. Entries in allowed_domains and
            disallowed_domains must be plain domain names: URLs and domains
            with ports are ignored with a warning.
            """
valid_domains = []

for domain in domains_list:
if domain is None:
continue
if url_pattern.match(domain):
message = (
f"{domains_type} accepts only domains, not URLs. "
f"Ignoring URL entry {domain} in {domains_type}."
)
warnings.warn(message, URLWarning)
elif port_pattern.search(domain):
message = (
f"{domains_type} accepts only domains without ports. "
f"Ignoring entry {domain} in {domains_type}."
)
warnings.warn(message, PortWarning)
else:
valid_domains.append(re.escape(domain))
return valid_domains
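        # Illustrative call, assuming the validation above: passing
        #   ["example.com", "http://example.com", "example.com:8080"]
        # as allowed_domains returns ["example\\.com"], with a URLWarning
        # for the URL entry and a PortWarning for the entry with a port.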

        # Validated, regex-escaped domains used to build the pattern
        allowed_domains = process_domains(allowed_domains_arg, "allowed_domains")
        disallowed_domains = process_domains(
            disallowed_domains_arg, "disallowed_domains"
        )

        # Match hosts that are an allowed domain or one of its subdomains;
        # with no allowed_domains, fall back to matching any host
        if allowed_domains:
            allowed_domain_pattern = rf"(.*\.)?({'|'.join(allowed_domains)})$"
        else:
            allowed_domain_pattern = r".*$"

        # Negative lookahead that rejects hosts equal to, or a subdomain of,
        # a domain in the `disallowed_domains` list
        if disallowed_domains:
            disallowed_domain_pattern = rf"(?!(.*\.)?({'|'.join(disallowed_domains)})$)"
        else:
            disallowed_domain_pattern = ""

        # Prepend the lookahead so a host must fall inside allowed_domains
        # AND outside disallowed_domains; disallowed_domains thus carves
        # exceptions out of allowed_domains
        combined_pattern = rf"^{disallowed_domain_pattern}{allowed_domain_pattern}"
        return re.compile(combined_pattern)
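        # For the docstring example (allowed_domains=['example.com'],
        # disallowed_domains=['example2.com']) the pattern built above is:
        #   ^(?!(.*\.)?(example2\.com)$)(.*\.)?(example\.com)$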

def spider_opened(self, spider):
self.host_regex = self.get_host_regex(spider)
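For reference, a minimal sketch of how a spider would use the new option (the spider name, domains, and start URL here are placeholders; allowed_domains and disallowed_domains are the only spider attributes the middleware reads):

import scrapy

class ExampleSpider(scrapy.Spider):
    name = "example"  # placeholder
    # Crawl example.com and its subdomains...
    allowed_domains = ["example.com"]
    # ...but skip example2.com and its subdomains (new in this PR)
    disallowed_domains = ["example2.com"]
    start_urls = ["https://example.com/"]  # placeholder

    def parse(self, response):
        # Requests to offsite hosts (e.g. www.example2.com) are dropped
        # by OffsiteMiddleware before they are scheduled
        for href in response.css("a::attr(href)").getall():
            yield response.follow(href, callback=self.parse)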