File: robotparser.py
""" robotparser.py
[0] Fix | Delete
[1] Fix | Delete
Copyright (C) 2000 Bastian Kleineidam
[2] Fix | Delete
[3] Fix | Delete
You can choose between two licenses when using this package:
[4] Fix | Delete
1) GNU GPLv2
[5] Fix | Delete
2) PSF license for Python 2.2
[6] Fix | Delete
[7] Fix | Delete
The robots.txt Exclusion Protocol is implemented as specified in
[8] Fix | Delete
http://www.robotstxt.org/norobots-rfc.txt
[9] Fix | Delete
[10] Fix | Delete
"""
[11] Fix | Delete
import urlparse
[12] Fix | Delete
import urllib
[13] Fix | Delete
[14] Fix | Delete
__all__ = ["RobotFileParser"]
[15] Fix | Delete
[16] Fix | Delete
[17] Fix | Delete
class RobotFileParser:
    """ This class provides a set of methods to read, parse and answer
    questions about a single robots.txt file.

    """

    def __init__(self, url=''):
        self.entries = []
        self.default_entry = None
        self.disallow_all = False
        self.allow_all = False
        self.set_url(url)
        self.last_checked = 0

    def mtime(self):
        """Returns the time the robots.txt file was last fetched.

        This is useful for long-running web spiders that need to
        check for new robots.txt files periodically.

        """
        return self.last_checked

    def modified(self):
        """Sets the time the robots.txt file was last fetched to the
        current time.

        """
        import time
        self.last_checked = time.time()

    def set_url(self, url):
        """Sets the URL referring to a robots.txt file."""
        self.url = url
        self.host, self.path = urlparse.urlparse(url)[1:3]

    def read(self):
        """Reads the robots.txt URL and feeds it to the parser."""
        opener = URLopener()
        f = opener.open(self.url)
        lines = [line.strip() for line in f]
        f.close()
        self.errcode = opener.errcode
        if self.errcode in (401, 403):
            self.disallow_all = True
        elif self.errcode >= 400 and self.errcode < 500:
            self.allow_all = True
        elif self.errcode == 200 and lines:
            self.parse(lines)

    def _add_entry(self, entry):
        if "*" in entry.useragents:
            # the default entry is considered last
            if self.default_entry is None:
                # the first default entry wins
                self.default_entry = entry
        else:
            self.entries.append(entry)

    def parse(self, lines):
        """parse the input lines from a robots.txt file.
           We allow that a user-agent: line is not preceded by
           one or more blank lines."""
        # states:
        #   0: start state
        #   1: saw user-agent line
        #   2: saw an allow or disallow line
        state = 0
        linenumber = 0
        entry = Entry()

        self.modified()
        for line in lines:
            linenumber += 1
            if not line:
                if state == 1:
                    entry = Entry()
                    state = 0
                elif state == 2:
                    self._add_entry(entry)
                    entry = Entry()
                    state = 0
            # remove optional comment and strip line
            i = line.find('#')
            if i >= 0:
                line = line[:i]
            line = line.strip()
            if not line:
                continue
            line = line.split(':', 1)
            if len(line) == 2:
                line[0] = line[0].strip().lower()
                line[1] = urllib.unquote(line[1].strip())
                if line[0] == "user-agent":
                    if state == 2:
                        self._add_entry(entry)
                        entry = Entry()
                    entry.useragents.append(line[1])
                    state = 1
                elif line[0] == "disallow":
                    if state != 0:
                        entry.rulelines.append(RuleLine(line[1], False))
                        state = 2
                elif line[0] == "allow":
                    if state != 0:
                        entry.rulelines.append(RuleLine(line[1], True))
                        state = 2
        if state == 2:
            self._add_entry(entry)

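    # Editorial illustration (not part of the original module): given a
    # hypothetical robots.txt such as the one sketched below, the state
    # machine above collects each User-agent record into one Entry; the
    # record for "*" is kept separately as self.default_entry rather than
    # being appended to self.entries.
    #
    #     User-agent: FigTree       ->  Entry(useragents=["FigTree"],
    #     Disallow: /tmp                       rulelines=[Disallow: /tmp])
    #
    #     User-agent: *             ->  default_entry(useragents=["*"],
    #     Disallow: /cgi-bin                   rulelines=[Disallow: /cgi-bin])
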
    def can_fetch(self, useragent, url):
        """using the parsed robots.txt decide if useragent can fetch url"""
        if self.disallow_all:
            return False
        if self.allow_all:
            return True

        # Until the robots.txt file has been read or found not
        # to exist, we must assume that no url is allowable.
        # This prevents false positives when a user erroneously
        # calls can_fetch() before calling read().
        if not self.last_checked:
            return False

        # search for given user agent matches
        # the first match counts
        parsed_url = urlparse.urlparse(urllib.unquote(url))
        url = urlparse.urlunparse(('', '', parsed_url.path,
            parsed_url.params, parsed_url.query, parsed_url.fragment))
        url = urllib.quote(url)
        if not url:
            url = "/"
        for entry in self.entries:
            if entry.applies_to(useragent):
                return entry.allowance(url)
        # try the default entry last
        if self.default_entry:
            return self.default_entry.allowance(url)
        # agent not found ==> access granted
        return True

    def __str__(self):
        entries = self.entries
        if self.default_entry is not None:
            entries = entries + [self.default_entry]
        return '\n'.join(map(str, entries)) + '\n'


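# A minimal usage sketch (editorial addition, not part of the original
# module), assuming a site at the hypothetical address
# http://www.example.com/ and a crawler named "MyCrawler":
#
#     rp = RobotFileParser()
#     rp.set_url("http://www.example.com/robots.txt")
#     rp.read()   # fetch and parse the remote robots.txt
#     rp.can_fetch("MyCrawler", "http://www.example.com/some/page.html")
#
# read() records the fetch time, so can_fetch() answers False for every URL
# until the file has actually been retrieved (or found not to exist).
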
class RuleLine:
    """A rule line is a single "Allow:" (allowance==True) or "Disallow:"
       (allowance==False) followed by a path."""
    def __init__(self, path, allowance):
        if path == '' and not allowance:
            # an empty value means allow all
            allowance = True
        path = urlparse.urlunparse(urlparse.urlparse(path))
        self.path = urllib.quote(path)
        self.allowance = allowance

    def applies_to(self, filename):
        return self.path == "*" or filename.startswith(self.path)

    def __str__(self):
        return (self.allowance and "Allow" or "Disallow") + ": " + self.path


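# For illustration (editorial note): rule matching is a plain prefix test,
# so RuleLine("/private/", False).applies_to("/private/data.html") is True,
# while RuleLine("/private/", False).applies_to("/public/data.html") is False.
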
class Entry:
    """An entry has one or more user-agents and zero or more rulelines"""
    def __init__(self):
        self.useragents = []
        self.rulelines = []

    def __str__(self):
        ret = []
        for agent in self.useragents:
            ret.extend(["User-agent: ", agent, "\n"])
        for line in self.rulelines:
            ret.extend([str(line), "\n"])
        return ''.join(ret)

    def applies_to(self, useragent):
        """check if this entry applies to the specified agent"""
        # split the name token and make it lower case
        useragent = useragent.split("/")[0].lower()
        for agent in self.useragents:
            if agent == '*':
                # we have the catch-all agent
                return True
            agent = agent.lower()
            if agent in useragent:
                return True
        return False

    def allowance(self, filename):
        """Preconditions:
        - our agent applies to this entry
        - filename is URL decoded"""
        for line in self.rulelines:
            if line.applies_to(filename):
                return line.allowance
        return True

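# For illustration (editorial note): applies_to() keeps only the product
# token of the user agent string (everything before the first "/"), lower-
# cases it, and does a substring test.  So an entry whose useragents list
# contains "FigTree" applies to the agent string "FigTree/0.1", and an entry
# containing "*" applies to any agent.
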
class URLopener(urllib.FancyURLopener):
    def __init__(self, *args):
        urllib.FancyURLopener.__init__(self, *args)
        self.errcode = 200

    def prompt_user_passwd(self, host, realm):
        ## If robots.txt file is accessible only with a password,
        ## we act as if the file wasn't there.
        return None, None

    def http_error_default(self, url, fp, errcode, errmsg, headers):
        self.errcode = errcode
        return urllib.FancyURLopener.http_error_default(self, url, fp, errcode,
                                                        errmsg, headers)
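
# The block below is an editorial sketch and not part of the original module.
# It feeds parse() a small hand-written robots.txt (parse() stamps
# last_checked via modified(), so no network access is needed) and queries
# can_fetch() for hypothetical crawler names and URLs.
if __name__ == '__main__':
    rp = RobotFileParser()
    rp.parse([
        "User-agent: CrawlerBot",
        "Disallow: /private/",
        "",
        "User-agent: *",
        "Disallow: /tmp/",
    ])
    # The named entry matches first; /private/ is blocked for CrawlerBot.
    print rp.can_fetch("CrawlerBot", "http://www.example.com/private/page.html")  # False
    print rp.can_fetch("CrawlerBot", "http://www.example.com/index.html")         # True
    # Other agents fall through to the default "*" entry.
    print rp.can_fetch("OtherBot", "http://www.example.com/tmp/cache.html")       # False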