Edit File by line

""" robotparser.py

[0] Fix | Delete

[1] Fix | Delete

[2] Fix | Delete

[3] Fix | Delete

You can choose between two licenses when using this package:

[4] Fix | Delete

1) GNU GPLv2

[5] Fix | Delete

2) PSF license for Python 2.2

[6] Fix | Delete

[7] Fix | Delete

The robots.txt Exclusion Protocol is implemented as specified in

[8] Fix | Delete

http://www.robotstxt.org/norobots-rfc.txt

[9] Fix | Delete

[10] Fix | Delete

"""

[11] Fix | Delete

import urlparse

[12] Fix | Delete

import urllib

[13] Fix | Delete

[14] Fix | Delete

__all__ = ["RobotFileParser"]

[15] Fix | Delete

[16] Fix | Delete

[17] Fix | Delete

class RobotFileParser:

[18] Fix | Delete

""" This class provides a set of methods to read, parse and answer

[19] Fix | Delete

questions about a single robots.txt file.

[20] Fix | Delete

[21] Fix | Delete

"""

[22] Fix | Delete

[23] Fix | Delete

def __init__(self, url=''):

[24] Fix | Delete

self.entries = []

[25] Fix | Delete

self.default_entry = None

[26] Fix | Delete

self.disallow_all = False

[27] Fix | Delete

self.allow_all = False

[28] Fix | Delete

self.set_url(url)

[29] Fix | Delete

self.last_checked = 0

[30] Fix | Delete

[31] Fix | Delete

def mtime(self):

[32] Fix | Delete

"""Returns the time the robots.txt file was last fetched.

[33] Fix | Delete

[34] Fix | Delete

This is useful for long-running web spiders that need to

[35] Fix | Delete

check for new robots.txt files periodically.

[36] Fix | Delete

[37] Fix | Delete

"""

[38] Fix | Delete

return self.last_checked

[39] Fix | Delete

[40] Fix | Delete

def modified(self):

[41] Fix | Delete

"""Sets the time the robots.txt file was last fetched to the

[42] Fix | Delete

current time.

[43] Fix | Delete

[44] Fix | Delete

"""

[45] Fix | Delete

import time

[46] Fix | Delete

self.last_checked = time.time()

[47] Fix | Delete

[48] Fix | Delete

def set_url(self, url):

[49] Fix | Delete

"""Sets the URL referring to a robots.txt file."""

[50] Fix | Delete

self.url = url

[51] Fix | Delete

self.host, self.path = urlparse.urlparse(url)[1:3]

[52] Fix | Delete

[53] Fix | Delete

def read(self):

[54] Fix | Delete

"""Reads the robots.txt URL and feeds it to the parser."""

[55] Fix | Delete

opener = URLopener()

[56] Fix | Delete

f = opener.open(self.url)

[57] Fix | Delete

lines = [line.strip() for line in f]

[58] Fix | Delete

f.close()

[59] Fix | Delete

self.errcode = opener.errcode

[60] Fix | Delete

if self.errcode in (401, 403):

[61] Fix | Delete

self.disallow_all = True

[62] Fix | Delete

elif self.errcode >= 400 and self.errcode < 500:

[63] Fix | Delete

self.allow_all = True

[64] Fix | Delete

elif self.errcode == 200 and lines:

[65] Fix | Delete

self.parse(lines)

[66] Fix | Delete

[67] Fix | Delete

def _add_entry(self, entry):

[68] Fix | Delete

if "*" in entry.useragents:

[69] Fix | Delete

# the default entry is considered last

[70] Fix | Delete

if self.default_entry is None:

[71] Fix | Delete

# the first default entry wins

[72] Fix | Delete

self.default_entry = entry

[73] Fix | Delete

else:

[74] Fix | Delete

self.entries.append(entry)

[75] Fix | Delete

[76] Fix | Delete

def parse(self, lines):

[77] Fix | Delete

"""parse the input lines from a robots.txt file.

[78] Fix | Delete

We allow that a user-agent: line is not preceded by

[79] Fix | Delete

one or more blank lines."""

[80] Fix | Delete

# states:

[81] Fix | Delete

# 0: start state

[82] Fix | Delete

# 1: saw user-agent line

[83] Fix | Delete

# 2: saw an allow or disallow line

[84] Fix | Delete

state = 0

[85] Fix | Delete

linenumber = 0

[86] Fix | Delete

entry = Entry()

[87] Fix | Delete

[88] Fix | Delete

self.modified()

[89] Fix | Delete

for line in lines:

[90] Fix | Delete

linenumber += 1

[91] Fix | Delete

if not line:

[92] Fix | Delete

if state == 1:

[93] Fix | Delete

entry = Entry()

[94] Fix | Delete

state = 0

[95] Fix | Delete

elif state == 2:

[96] Fix | Delete

self._add_entry(entry)

[97] Fix | Delete

entry = Entry()

[98] Fix | Delete

state = 0

[99] Fix | Delete

# remove optional comment and strip line

[100] Fix | Delete

i = line.find('#')

[101] Fix | Delete

if i >= 0:

[102] Fix | Delete

line = line[:i]

[103] Fix | Delete

line = line.strip()

[104] Fix | Delete

if not line:

[105] Fix | Delete

continue

[106] Fix | Delete

line = line.split(':', 1)

[107] Fix | Delete

if len(line) == 2:

[108] Fix | Delete

line[0] = line[0].strip().lower()

[109] Fix | Delete

line[1] = urllib.unquote(line[1].strip())

[110] Fix | Delete

if line[0] == "user-agent":

[111] Fix | Delete

if state == 2:

[112] Fix | Delete

self._add_entry(entry)

[113] Fix | Delete

entry = Entry()

[114] Fix | Delete

entry.useragents.append(line[1])

[115] Fix | Delete

state = 1

[116] Fix | Delete

elif line[0] == "disallow":

[117] Fix | Delete

if state != 0:

[118] Fix | Delete

entry.rulelines.append(RuleLine(line[1], False))

[119] Fix | Delete

state = 2

[120] Fix | Delete

elif line[0] == "allow":

[121] Fix | Delete

if state != 0:

[122] Fix | Delete

entry.rulelines.append(RuleLine(line[1], True))

[123] Fix | Delete

state = 2

[124] Fix | Delete

if state == 2:

[125] Fix | Delete

self._add_entry(entry)

[126] Fix | Delete

[127] Fix | Delete

[128] Fix | Delete

def can_fetch(self, useragent, url):

[129] Fix | Delete

"""using the parsed robots.txt decide if useragent can fetch url"""

[130] Fix | Delete

if self.disallow_all:

[131] Fix | Delete

return False

[132] Fix | Delete

if self.allow_all:

[133] Fix | Delete

return True

[134] Fix | Delete

[135] Fix | Delete

# Until the robots.txt file has been read or found not

[136] Fix | Delete

# to exist, we must assume that no url is allowable.

[137] Fix | Delete

# This prevents false positives when a user erroneously

[138] Fix | Delete

# calls can_fetch() before calling read().

[139] Fix | Delete

if not self.last_checked:

[140] Fix | Delete

return False

[141] Fix | Delete

[142] Fix | Delete

# search for given user agent matches

[143] Fix | Delete

# the first match counts

[144] Fix | Delete

parsed_url = urlparse.urlparse(urllib.unquote(url))

[145] Fix | Delete

url = urlparse.urlunparse(('', '', parsed_url.path,

[146] Fix | Delete

parsed_url.params, parsed_url.query, parsed_url.fragment))

[147] Fix | Delete

url = urllib.quote(url)

[148] Fix | Delete

if not url:

[149] Fix | Delete

url = "/"

[150] Fix | Delete

for entry in self.entries:

[151] Fix | Delete

if entry.applies_to(useragent):

[152] Fix | Delete

return entry.allowance(url)

[153] Fix | Delete

# try the default entry last

[154] Fix | Delete

if self.default_entry:

[155] Fix | Delete

return self.default_entry.allowance(url)

[156] Fix | Delete

# agent not found ==> access granted

[157] Fix | Delete

return True

[158] Fix | Delete

[159] Fix | Delete

[160] Fix | Delete

def __str__(self):

[161] Fix | Delete

return ''.join([str(entry) + "\n" for entry in self.entries])

[162] Fix | Delete

[163] Fix | Delete

[164] Fix | Delete

class RuleLine:

[165] Fix | Delete

"""A rule line is a single "Allow:" (allowance==True) or "Disallow:"

[166] Fix | Delete

(allowance==False) followed by a path."""

[167] Fix | Delete

def __init__(self, path, allowance):

[168] Fix | Delete

if path == '' and not allowance:

[169] Fix | Delete

# an empty value means allow all

[170] Fix | Delete

allowance = True

[171] Fix | Delete

path = urlparse.urlunparse(urlparse.urlparse(path))

[172] Fix | Delete

self.path = urllib.quote(path)

[173] Fix | Delete

self.allowance = allowance

[174] Fix | Delete

[175] Fix | Delete

def applies_to(self, filename):

[176] Fix | Delete

return self.path == "*" or filename.startswith(self.path)

[177] Fix | Delete

[178] Fix | Delete

def __str__(self):

[179] Fix | Delete

return (self.allowance and "Allow" or "Disallow") + ": " + self.path

[180] Fix | Delete

[181] Fix | Delete

[182] Fix | Delete

class Entry:

[183] Fix | Delete

"""An entry has one or more user-agents and zero or more rulelines"""

[184] Fix | Delete

def __init__(self):

[185] Fix | Delete

self.useragents = []

[186] Fix | Delete

self.rulelines = []

[187] Fix | Delete

[188] Fix | Delete

def __str__(self):

[189] Fix | Delete

ret = []

[190] Fix | Delete

for agent in self.useragents:

[191] Fix | Delete

ret.extend(["User-agent: ", agent, "\n"])

[192] Fix | Delete

for line in self.rulelines:

[193] Fix | Delete

ret.extend([str(line), "\n"])

[194] Fix | Delete

return ''.join(ret)

[195] Fix | Delete

[196] Fix | Delete

def applies_to(self, useragent):

[197] Fix | Delete

"""check if this entry applies to the specified agent"""

[198] Fix | Delete

# split the name token and make it lower case

[199] Fix | Delete

useragent = useragent.split("/")[0].lower()

[200] Fix | Delete

for agent in self.useragents:

[201] Fix | Delete

if agent == '*':

[202] Fix | Delete

# we have the catch-all agent

[203] Fix | Delete

return True

[204] Fix | Delete

agent = agent.lower()

[205] Fix | Delete

if agent in useragent:

[206] Fix | Delete

return True

[207] Fix | Delete

return False

[208] Fix | Delete

[209] Fix | Delete

def allowance(self, filename):

[210] Fix | Delete

"""Preconditions:

[211] Fix | Delete

- our agent applies to this entry

[212] Fix | Delete

- filename is URL decoded"""

[213] Fix | Delete

for line in self.rulelines:

[214] Fix | Delete

if line.applies_to(filename):

[215] Fix | Delete

return line.allowance

[216] Fix | Delete

return True

[217] Fix | Delete

[218] Fix | Delete

class URLopener(urllib.FancyURLopener):

[219] Fix | Delete

def __init__(self, *args):

[220] Fix | Delete

urllib.FancyURLopener.__init__(self, *args)

[221] Fix | Delete

self.errcode = 200

[222] Fix | Delete

[223] Fix | Delete

def prompt_user_passwd(self, host, realm):

[224] Fix | Delete

## If robots.txt file is accessible only with a password,

[225] Fix | Delete

## we act as if the file wasn't there.

[226] Fix | Delete

return None, None

[227] Fix | Delete

[228] Fix | Delete

def http_error_default(self, url, fp, errcode, errmsg, headers):

[229] Fix | Delete

self.errcode = errcode

[230] Fix | Delete

return urllib.FancyURLopener.http_error_default(self, url, fp, errcode,

[231] Fix | Delete

errmsg, headers)

[232] Fix | Delete

[233] Fix | Delete