1997-01-30 11:18:23 +08:00
|
|
|
"""
|
|
|
|
|
|
|
|
Robots.txt file parser class.  Accepts a list of lines or a robots.txt URL
as input, builds a set of rules from that list, then answers questions
about the fetchability of other URLs.
|
|
|
|
|
|
|
|
"""
|
|
|
|
|
|
|
|
class RobotFileParser:
    """Parse a robots.txt file and answer whether a URL may be fetched.

    Either call set_url() followed by read(), or pass the lines of an
    already-fetched robots.txt to parse(); afterwards query can_fetch().

    Attributes:
        rules: dict mapping a user-agent name, exactly as written in the
            file, to a list of compiled patterns (one per disallowed path
            prefix).  An empty list means nothing is disallowed.
        debug: when true, parsing and matching decisions are traced on
            standard output.
        url: location of the robots.txt file fetched by read().
        last_checked: time.time() value recorded by the latest parse(),
            or 0 if nothing has been parsed yet.
    """

    def __init__(self):
        self.rules = {}
        self.debug = 0
        self.url = ''
        self.last_checked = 0

    def _trace(self, *parts):
        """Write the space-joined parts to stdout when self.debug is set."""
        if self.debug:
            import sys
            sys.stdout.write(' '.join([str(part) for part in parts]) + '\n')

    def mtime(self):
        """Return the time recorded by the most recent modified() call."""
        return self.last_checked

    def modified(self):
        """Record the current time as the moment the rules were refreshed."""
        import time
        self.last_checked = time.time()

    def set_url(self, url):
        """Set the URL that read() will fetch."""
        self.url = url

    def read(self):
        """Fetch self.url and feed its lines to parse()."""
        try:
            from urllib.request import urlopen  # Python 3
        except ImportError:
            from urllib import urlopen  # Python 2
        stream = urlopen(self.url)
        try:
            raw_lines = stream.readlines()
        finally:
            # Always release the connection, even if reading fails.
            stream.close()
        lines = []
        for raw in raw_lines:
            # Python 3 delivers bytes; Python 2 str passes through untouched.
            if not isinstance(raw, str):
                raw = raw.decode('utf-8', 'replace')
            lines.append(raw)
        self.parse(lines)

    def parse(self, lines):
        """Parse the input lines from a robots.txt file.

        lines is an iterable of strings, with or without trailing line
        terminators.  Rules are accumulated into self.rules and the
        modification time is updated once all lines are processed.
        """
        import re
        active = []  # user agents the current record applies to
        for line in lines:
            self._trace('>', line.rstrip('\r\n'))
            # A blank line terminates the current record.  Testing the
            # stripped text also covers '\r\n' and lines lacking a newline.
            if not line.strip():
                active = []
                continue
            # Remove an optional comment; only cut when '#' is present so
            # the last character of an unterminated line is not lost.
            hash_pos = line.find('#')
            if hash_pos >= 0:
                line = line[:hash_pos]
            line = line.strip()
            if not line:
                # Comment-only line: ignored, does not end the record.
                continue
            # Split on the first colon only; the value may contain colons.
            fields = [part.strip() for part in line.split(':', 1)]
            if len(fields) != 2:
                self._trace('>> unknown:', fields)
                continue
            key = fields[0].lower()
            value = fields[1]
            if key == 'user-agent':
                # This record applies to this user agent.
                self._trace('>> user-agent:', value)
                active.append(value)
                if value not in self.rules:
                    self.rules[value] = []
            elif key == 'disallow':
                if value:
                    self._trace('>> disallow:', value)
                    # The path is a literal prefix, not a regular
                    # expression, so escape it before compiling.
                    pattern = re.compile(re.escape(value))
                    for agent in active:
                        self.rules[agent].append(pattern)
                else:
                    # An empty Disallow clears the rules collected so far
                    # for every agent of the current record.
                    for agent in active:
                        self._trace('>> allow', agent)
                        self.rules[agent] = []
            else:
                self._trace('>> unknown:', [key, value])
        self.modified()

    def can_fetch(self, useragent, url):
        """Using the parsed robots.txt, decide if useragent can fetch url.

        Returns 1 if the agent is allowed to fetch url and 0 otherwise.
        The agent name is looked up exactly; if it has no record the '*'
        record is used, and if that is missing too the fetch is allowed.
        Only the path component of url is compared against the rules.
        """
        try:
            from urllib.parse import urlparse  # Python 3
        except ImportError:
            from urlparse import urlparse  # Python 2
        agent = useragent
        if agent not in self.rules:
            agent = '*'
        if agent not in self.rules:
            self._trace('>> allowing', url, 'fetch by', useragent)
            return 1
        # A URL without a path refers to the site root.
        path = urlparse(url)[2] or '/'
        for rule in self.rules[agent]:
            if rule.match(path) is not None:
                self._trace('>> disallowing', url, 'fetch by', useragent)
                return 0
        self._trace('>> allowing', url, 'fetch by', useragent)
        return 1
|
1997-01-30 11:18:23 +08:00
|
|
|
|
2000-03-28 03:29:31 +08:00
|
|
|
def _test():
    """Fetch a live robots.txt and print the rules plus two sample verdicts.

    Performs a network request via RobotFileParser.read() and runs with
    debug tracing switched on, so it is meant for manual runs only.
    """
    rp = RobotFileParser()
    rp.debug = 1
    rp.set_url('http://www.musi-cal.com/robots.txt')
    rp.read()
    # Single-argument print(...) behaves the same on Python 2 and 3.
    print(rp.rules)
    # Host fixed from the typo'd 'www.musi-cal.com.com'; only the path
    # takes part in matching, so the verdict is unaffected.
    print(rp.can_fetch('*', 'http://www.musi-cal.com/'))
    print(rp.can_fetch(
        'Musi-Cal-Robot',
        'http://www.musi-cal.com/cgi-bin/event-search?city=San+Francisco'))
|
1997-01-30 11:18:23 +08:00
|
|
|
|
2000-03-28 03:29:31 +08:00
|
|
|
# Run the self-test only when this file is executed as a script.
if __name__ == '__main__':
    _test()
|