""" robotparser.py

    You can choose between two licenses when using this package:
    1) GNU GPLv2
    2) PSF license for Python 2.2

    The robots.txt Exclusion Protocol is implemented as specified in
    http://www.robotstxt.org/norobots-rfc.txt
"""

import collections
import urllib.error
import urllib.parse
import urllib.request

# Only the parser class is part of the public API.
__all__ = ["RobotFileParser"]

# Value of a "Request-rate: <requests>/<seconds>" directive, as stored on
# Entry.req_rate and returned by RobotFileParser.request_rate().
RequestRate = collections.namedtuple("RequestRate", "requests seconds")


class RobotFileParser:
    """Read, parse and answer questions about a single robots.txt file.

    Attributes set by this class:
        entries: list of Entry objects for specific user agents, in file
            order.
        default_entry: the first Entry whose user agents include "*", or
            None when the file has no such entry.
        disallow_all: True after read() received HTTP 401 or 403.
        allow_all: True after read() received any other 4xx status.
        last_checked: timestamp of the last parse() call (0 until then).
        url, host, path: the robots.txt URL and its network-location and
            path components, as set by set_url().
    """

    def __init__(self, url=''):
        self.entries = []
        self.default_entry = None
        self.disallow_all = False
        self.allow_all = False
        self.set_url(url)
        self.last_checked = 0

    def mtime(self):
        """Return the time the robots.txt file was last fetched.

        This is useful for long-running web spiders that need to
        check for new robots.txt files periodically.
        """
        return self.last_checked

    def modified(self):
        """Set the time the robots.txt file was last fetched to the
        current time.
        """
        import time
        self.last_checked = time.time()

    def set_url(self, url):
        """Set the URL referring to a robots.txt file."""
        self.url = url
        # Components 1 and 2 of the parse result are netloc and path.
        self.host, self.path = urllib.parse.urlparse(url)[1:3]

    def read(self):
        """Read the robots.txt URL and feed it to the parser.

        HTTP 401/403 marks everything as disallowed; any other 4xx status
        marks everything as allowed.  Other HTTP errors leave the parser
        state untouched.  The response body is decoded as UTF-8.
        """
        try:
            f = urllib.request.urlopen(self.url)
        except urllib.error.HTTPError as err:
            if err.code in (401, 403):
                self.disallow_all = True
            elif 400 <= err.code < 500:
                self.allow_all = True
            # The error object wraps the HTTP response; release it.
            err.close()
        else:
            # Close the response as soon as the body has been read.
            with f:
                raw = f.read()
            self.parse(raw.decode("utf-8").splitlines())

    def _add_entry(self, entry):
        """Store *entry* either as the default entry or as a specific one."""
        if "*" in entry.useragents:
            # the default entry is considered last
            if self.default_entry is None:
                # the first default entry wins
                self.default_entry = entry
        else:
            self.entries.append(entry)

    def parse(self, lines):
        """Parse the input lines from a robots.txt file.

        We allow that a user-agent: line is not preceded by
        one or more blank lines.

        lines: iterable of text lines without line terminators.
        """
        # states:
        #   0: start state
        #   1: saw user-agent line
        #   2: saw an allow or disallow line
        state = 0
        entry = Entry()

        self.modified()
        for line in lines:
            if not line:
                # A blank line terminates the current record.
                if state == 1:
                    entry = Entry()
                    state = 0
                elif state == 2:
                    self._add_entry(entry)
                    entry = Entry()
                    state = 0
            # remove optional comment and strip line
            i = line.find('#')
            if i >= 0:
                line = line[:i]
            line = line.strip()
            if not line:
                continue
            parts = line.split(':', 1)
            if len(parts) != 2:
                # Not a "field: value" line; ignore it.
                continue
            key = parts[0].strip().lower()
            value = urllib.parse.unquote(parts[1].strip())
            if key == "user-agent":
                if state == 2:
                    # A new record starts without a separating blank line.
                    self._add_entry(entry)
                    entry = Entry()
                entry.useragents.append(value)
                state = 1
            elif key == "disallow":
                if state != 0:
                    entry.rulelines.append(RuleLine(value, False))
                    state = 2
            elif key == "allow":
                if state != 0:
                    entry.rulelines.append(RuleLine(value, True))
                    state = 2
            elif key == "crawl-delay":
                if state != 0:
                    # Validate before converting so that a malformed
                    # robots.txt cannot crash the parser.  isdecimal() is
                    # used rather than isdigit() because isdigit() also
                    # accepts characters (e.g. superscript digits) that
                    # int() rejects with ValueError.
                    if value.strip().isdecimal():
                        entry.delay = int(value)
                    state = 2
            elif key == "request-rate":
                if state != 0:
                    numbers = value.split('/')
                    # check if all values are sane
                    if (len(numbers) == 2
                            and numbers[0].strip().isdecimal()
                            and numbers[1].strip().isdecimal()):
                        entry.req_rate = RequestRate(int(numbers[0]),
                                                     int(numbers[1]))
                    state = 2
        if state == 2:
            # The file ended without a trailing blank line.
            self._add_entry(entry)

    def can_fetch(self, useragent, url):
        """Using the parsed robots.txt decide if useragent can fetch url.

        Returns False while nothing has been parsed yet, and True when no
        entry (specific or default) applies to *useragent*.
        """
        if self.disallow_all:
            return False
        if self.allow_all:
            return True
        # Until the robots.txt file has been read or found not
        # to exist, we must assume that no url is allowable.
        # This prevents false positives when a user erroneously
        # calls can_fetch() before calling read().
        if not self.last_checked:
            return False
        # search for given user agent matches
        # the first match counts
        parsed_url = urllib.parse.urlparse(urllib.parse.unquote(url))
        # Drop scheme and host: rules are matched against the path part only.
        url = urllib.parse.urlunparse(('', '', parsed_url.path,
                                       parsed_url.params, parsed_url.query,
                                       parsed_url.fragment))
        url = urllib.parse.quote(url)
        if not url:
            url = "/"
        for entry in self.entries:
            if entry.applies_to(useragent):
                return entry.allowance(url)
        # try the default entry last
        if self.default_entry:
            return self.default_entry.allowance(url)
        # agent not found ==> access granted
        return True

    def crawl_delay(self, useragent):
        """Return the Crawl-delay value for *useragent*, or None.

        None is returned when nothing has been parsed yet, when the
        applicable entry has no Crawl-delay, or when no entry applies and
        there is no default entry.
        """
        if not self.mtime():
            return None
        for entry in self.entries:
            if entry.applies_to(useragent):
                return entry.delay
        # There may be no "*" entry at all; do not dereference None.
        if self.default_entry:
            return self.default_entry.delay
        return None

    def request_rate(self, useragent):
        """Return the RequestRate for *useragent*, or None.

        None is returned when nothing has been parsed yet, when the
        applicable entry has no Request-rate, or when no entry applies and
        there is no default entry.
        """
        if not self.mtime():
            return None
        for entry in self.entries:
            if entry.applies_to(useragent):
                return entry.req_rate
        # There may be no "*" entry at all; do not dereference None.
        if self.default_entry:
            return self.default_entry.req_rate
        return None

    def __str__(self):
        """Return the parsed entries as text, default entry last."""
        entries = self.entries
        if self.default_entry is not None:
            entries = entries + [self.default_entry]
        return '\n'.join(map(str, entries)) + '\n'


class RuleLine:

[198] Fix | Delete

"""A rule line is a single "Allow:" (allowance==True) or "Disallow:"

[199] Fix | Delete

(allowance==False) followed by a path."""

[200] Fix | Delete

def __init__(self, path, allowance):

[201] Fix | Delete

if path == '' and not allowance:

[202] Fix | Delete

# an empty value means allow all

[203] Fix | Delete

allowance = True

[204] Fix | Delete

path = urllib.parse.urlunparse(urllib.parse.urlparse(path))

[205] Fix | Delete

self.path = urllib.parse.quote(path)

[206] Fix | Delete

self.allowance = allowance

[207] Fix | Delete

[208] Fix | Delete

def applies_to(self, filename):

[209] Fix | Delete

return self.path == "*" or filename.startswith(self.path)

[210] Fix | Delete

[211] Fix | Delete

def __str__(self):

[212] Fix | Delete

return ("Allow" if self.allowance else "Disallow") + ": " + self.path

[213] Fix | Delete

[214] Fix | Delete

[215] Fix | Delete

class Entry:
    """An entry has one or more user-agents and zero or more rulelines.

    Attributes:
        useragents: list of user-agent strings this entry names.
        rulelines: list of rule objects, in file order.
        delay: Crawl-delay value, or None when not given.
        req_rate: Request-rate value (an object with ``requests`` and
            ``seconds`` attributes), or None when not given.
    """

    def __init__(self):
        self.useragents = []
        self.rulelines = []
        self.delay = None
        self.req_rate = None

    def __str__(self):
        """Return the entry in robots.txt syntax, ending with a newline."""
        ret = []
        for agent in self.useragents:
            ret.append(f"User-agent: {agent}")
        if self.delay is not None:
            ret.append(f"Crawl-delay: {self.delay}")
        if self.req_rate is not None:
            rate = self.req_rate
            ret.append(f"Request-rate: {rate.requests}/{rate.seconds}")
        ret.extend(map(str, self.rulelines))
        ret.append('')  # for compatibility
        return '\n'.join(ret)

    def applies_to(self, useragent):
        """Check if this entry applies to the specified agent.

        Only the part of *useragent* before the first "/" is considered,
        compared case-insensitively.  An entry applies when it lists "*"
        or when one of its agent names is a substring of that token.
        """
        # split the name token and make it lower case
        useragent = useragent.split("/")[0].lower()
        for agent in self.useragents:
            if agent == '*':
                # we have the catch-all agent
                return True
            agent = agent.lower()
            if agent in useragent:
                return True
        return False

    def allowance(self, filename):
        """Return whether *filename* may be fetched under this entry.

        Preconditions:
        - our agent applies to this entry
        - filename is in the same form as the rule paths it is compared
          against

        The first rule line that applies decides; when none applies the
        result is True.
        """
        for line in self.rulelines:
            if line.applies_to(filename):
                return line.allowance
        return True
