""" robotparser.py
[0] Fix | Delete
[1] Fix | Delete
Copyright (C) 2000 Bastian Kleineidam
[2] Fix | Delete
[3] Fix | Delete
You can choose between two licenses when using this package:
[4] Fix | Delete
1) GNU GPLv2
[5] Fix | Delete
2) PSF license for Python 2.2
[6] Fix | Delete
[7] Fix | Delete
The robots.txt Exclusion Protocol is implemented as specified in
[8] Fix | Delete
http://www.robotstxt.org/norobots-rfc.txt
[9] Fix | Delete
"""
[10] Fix | Delete
[11] Fix | Delete
import collections
[12] Fix | Delete
import urllib.parse
[13] Fix | Delete
import urllib.request
[14] Fix | Delete
[15] Fix | Delete
__all__ = ["RobotFileParser"]
[16] Fix | Delete
[17] Fix | Delete
RequestRate = collections.namedtuple("RequestRate", "requests seconds")
[18] Fix | Delete
[19] Fix | Delete
[20] Fix | Delete
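# Illustrative usage (an editorial sketch, not part of the original module):
# a typical caller points the parser at a site's robots.txt, fetches it, and
# then asks whether a given user agent may fetch a given URL.  The host name
# and agent string below are placeholders.
#
#     rp = RobotFileParser("https://example.com/robots.txt")
#     rp.read()
#     if rp.can_fetch("MyCrawler", "https://example.com/some/page"):
#         ...  # the page may be fetched
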
class RobotFileParser:
    """ This class provides a set of methods to read, parse and answer
    questions about a single robots.txt file.

    """

    def __init__(self, url=''):
        self.entries = []
        self.default_entry = None
        self.disallow_all = False
        self.allow_all = False
        self.set_url(url)
        self.last_checked = 0

    def mtime(self):
        """Returns the time the robots.txt file was last fetched.

        This is useful for long-running web spiders that need to
        check for new robots.txt files periodically.

        """
        return self.last_checked

    def modified(self):
        """Sets the time the robots.txt file was last fetched to the
        current time.

        """
        import time
        self.last_checked = time.time()

    def set_url(self, url):
        """Sets the URL referring to a robots.txt file."""
        self.url = url
        self.host, self.path = urllib.parse.urlparse(url)[1:3]

    def read(self):
        """Reads the robots.txt URL and feeds it to the parser."""
        try:
            f = urllib.request.urlopen(self.url)
        except urllib.error.HTTPError as err:
            if err.code in (401, 403):
                self.disallow_all = True
            elif err.code >= 400 and err.code < 500:
                self.allow_all = True
        else:
            raw = f.read()
            self.parse(raw.decode("utf-8").splitlines())

    def _add_entry(self, entry):
        if "*" in entry.useragents:
            # the default entry is considered last
            if self.default_entry is None:
                # the first default entry wins
                self.default_entry = entry
        else:
            self.entries.append(entry)

    def parse(self, lines):
        """Parse the input lines from a robots.txt file.

        We allow that a user-agent: line is not preceded by
        one or more blank lines.
        """
        # states:
        #   0: start state
        #   1: saw user-agent line
        #   2: saw an allow or disallow line
        state = 0
        entry = Entry()

        self.modified()
        for line in lines:
            if not line:
                if state == 1:
                    entry = Entry()
                    state = 0
                elif state == 2:
                    self._add_entry(entry)
                    entry = Entry()
                    state = 0
            # remove optional comment and strip line
            i = line.find('#')
            if i >= 0:
                line = line[:i]
            line = line.strip()
            if not line:
                continue
            line = line.split(':', 1)
            if len(line) == 2:
                line[0] = line[0].strip().lower()
                line[1] = urllib.parse.unquote(line[1].strip())
                if line[0] == "user-agent":
                    if state == 2:
                        self._add_entry(entry)
                        entry = Entry()
                    entry.useragents.append(line[1])
                    state = 1
                elif line[0] == "disallow":
                    if state != 0:
                        entry.rulelines.append(RuleLine(line[1], False))
                        state = 2
                elif line[0] == "allow":
                    if state != 0:
                        entry.rulelines.append(RuleLine(line[1], True))
                        state = 2
                elif line[0] == "crawl-delay":
                    if state != 0:
                        # before trying to convert to int we need to make
                        # sure that robots.txt has valid syntax otherwise
                        # it will crash
                        if line[1].strip().isdigit():
                            entry.delay = int(line[1])
                        state = 2
                elif line[0] == "request-rate":
                    if state != 0:
                        numbers = line[1].split('/')
                        # check if all values are sane
                        if (len(numbers) == 2 and numbers[0].strip().isdigit()
                                and numbers[1].strip().isdigit()):
                            entry.req_rate = RequestRate(int(numbers[0]), int(numbers[1]))
                        state = 2
        if state == 2:
            self._add_entry(entry)

    def can_fetch(self, useragent, url):
        """using the parsed robots.txt decide if useragent can fetch url"""
        if self.disallow_all:
            return False
        if self.allow_all:
            return True
        # Until the robots.txt file has been read or found not
        # to exist, we must assume that no url is allowable.
        # This prevents false positives when a user erroneously
        # calls can_fetch() before calling read().
        if not self.last_checked:
            return False
        # search for given user agent matches
        # the first match counts
        parsed_url = urllib.parse.urlparse(urllib.parse.unquote(url))
        url = urllib.parse.urlunparse(('', '', parsed_url.path,
                                       parsed_url.params, parsed_url.query,
                                       parsed_url.fragment))
        url = urllib.parse.quote(url)
        if not url:
            url = "/"
        for entry in self.entries:
            if entry.applies_to(useragent):
                return entry.allowance(url)
        # try the default entry last
        if self.default_entry:
            return self.default_entry.allowance(url)
        # agent not found ==> access granted
        return True

    def crawl_delay(self, useragent):
        if not self.mtime():
            return None
        for entry in self.entries:
            if entry.applies_to(useragent):
                return entry.delay
        # guard against a missing default entry instead of raising
        # AttributeError when robots.txt has no "User-agent: *" section
        if self.default_entry:
            return self.default_entry.delay
        return None

    def request_rate(self, useragent):
        if not self.mtime():
            return None
        for entry in self.entries:
            if entry.applies_to(useragent):
                return entry.req_rate
        if self.default_entry:
            return self.default_entry.req_rate
        return None

    def __str__(self):
        entries = self.entries
        if self.default_entry is not None:
            entries = entries + [self.default_entry]
        return '\n'.join(map(str, entries)) + '\n'


class RuleLine:
    """A rule line is a single "Allow:" (allowance==True) or "Disallow:"
    (allowance==False) followed by a path."""

    def __init__(self, path, allowance):
        if path == '' and not allowance:
            # an empty value means allow all
            allowance = True
        path = urllib.parse.urlunparse(urllib.parse.urlparse(path))
        self.path = urllib.parse.quote(path)
        self.allowance = allowance

    def applies_to(self, filename):
        return self.path == "*" or filename.startswith(self.path)

    def __str__(self):
        return ("Allow" if self.allowance else "Disallow") + ": " + self.path

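# Editorial note (illustrative, not from the original source): matching in
# RuleLine.applies_to() is a plain prefix test on the quoted path, so e.g.
#     RuleLine("/private/", False).applies_to("/private/data.html")  -> True
#     RuleLine("/private/", False).applies_to("/public/index.html")  -> False
# and an empty Disallow value is rewritten into an allow-everything rule.
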
class Entry:
    """An entry has one or more user-agents and zero or more rulelines"""

    def __init__(self):
        self.useragents = []
        self.rulelines = []
        self.delay = None
        self.req_rate = None

    def __str__(self):
        ret = []
        for agent in self.useragents:
            ret.append(f"User-agent: {agent}")
        if self.delay is not None:
            ret.append(f"Crawl-delay: {self.delay}")
        if self.req_rate is not None:
            rate = self.req_rate
            ret.append(f"Request-rate: {rate.requests}/{rate.seconds}")
        ret.extend(map(str, self.rulelines))
        ret.append('')  # for compatibility
        return '\n'.join(ret)

    def applies_to(self, useragent):
        """check if this entry applies to the specified agent"""
        # split the name token and make it lower case
        useragent = useragent.split("/")[0].lower()
        for agent in self.useragents:
            if agent == '*':
                # we have the catch-all agent
                return True
            agent = agent.lower()
            if agent in useragent:
                return True
        return False

    def allowance(self, filename):
        """Preconditions:
        - our agent applies to this entry
        - filename is URL decoded"""
        for line in self.rulelines:
            if line.applies_to(filename):
                return line.allowance
        return True

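
# Minimal runnable sketch (editorial addition, not part of the original
# module): feeding parse() an in-memory robots.txt avoids any network access
# and shows how the state machine above groups User-agent lines with their
# Allow/Disallow rules.  The agent names and paths are invented for the demo.
if __name__ == "__main__":
    _sample = [
        "User-agent: *",
        "Disallow: /private/",
        "Crawl-delay: 2",
        "",
        "User-agent: friendlybot",
        "Allow: /private/reports/",
        "Disallow: /private/",
    ]
    _rp = RobotFileParser()
    _rp.parse(_sample)
    print(_rp.can_fetch("somebot", "/private/data.html"))       # False
    print(_rp.can_fetch("somebot", "/public/index.html"))       # True
    print(_rp.can_fetch("friendlybot", "/private/reports/q1"))  # True
    print(_rp.crawl_delay("somebot"))                           # 2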