cpython/Lib/test/test_robotparser.py
Benjamin Peterson d63137159b Merged revisions 65209-65216,65225-65226,65233,65239,65246-65247,65255-65256 via svnmerge from
svn+ssh://pythondev@svn.python.org/python/trunk

........
  r65209 | raymond.hettinger | 2008-07-23 19:08:18 -0500 (Wed, 23 Jul 2008) | 1 line

  Finish-up the partial conversion from int to Py_ssize_t for deque indices and length.
........
  r65210 | raymond.hettinger | 2008-07-23 19:53:49 -0500 (Wed, 23 Jul 2008) | 1 line

  Parse to the correct datatype.
........
  r65211 | benjamin.peterson | 2008-07-23 21:27:46 -0500 (Wed, 23 Jul 2008) | 1 line

  fix spacing
........
  r65212 | benjamin.peterson | 2008-07-23 21:31:28 -0500 (Wed, 23 Jul 2008) | 1 line

  fix markup
........
  r65213 | benjamin.peterson | 2008-07-23 21:45:37 -0500 (Wed, 23 Jul 2008) | 1 line

  add some documentation for 2to3
........
  r65214 | raymond.hettinger | 2008-07-24 00:38:48 -0500 (Thu, 24 Jul 2008) | 1 line

  Finish conversion from int to Py_ssize_t.
........
  r65215 | raymond.hettinger | 2008-07-24 02:04:55 -0500 (Thu, 24 Jul 2008) | 1 line

  Convert from long to Py_ssize_t.
........
  r65216 | georg.brandl | 2008-07-24 02:09:21 -0500 (Thu, 24 Jul 2008) | 2 lines

  Fix indentation.
........
  r65225 | benjamin.peterson | 2008-07-25 11:55:37 -0500 (Fri, 25 Jul 2008) | 1 line

  teach .bzrignore about doc tools
........
  r65226 | benjamin.peterson | 2008-07-25 12:02:11 -0500 (Fri, 25 Jul 2008) | 1 line

  document default value for fillvalue
........
  r65233 | raymond.hettinger | 2008-07-25 13:43:33 -0500 (Fri, 25 Jul 2008) | 1 line

  Issue 1592:  Better error reporting for operations on closed shelves.
........
  r65239 | benjamin.peterson | 2008-07-25 16:59:53 -0500 (Fri, 25 Jul 2008) | 1 line

  fix indentation
........
  r65246 | andrew.kuchling | 2008-07-26 08:08:19 -0500 (Sat, 26 Jul 2008) | 1 line

  This sentence continues to bug me; rewrite it for the second time
........
  r65247 | andrew.kuchling | 2008-07-26 08:09:06 -0500 (Sat, 26 Jul 2008) | 1 line

  Remove extra words
........
  r65255 | skip.montanaro | 2008-07-26 19:49:02 -0500 (Sat, 26 Jul 2008) | 3 lines

  Close issue 3437 - missing state change when Allow lines are processed.
  Adds test cases which use Allow: as well.
........
  r65256 | skip.montanaro | 2008-07-26 19:50:41 -0500 (Sat, 26 Jul 2008) | 2 lines

  note robotparser bug fix.
........
2008-07-31 16:23:04 +00:00


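Of the merged revisions above, r65255 (issue 3437) is the one that touches this file: RobotFileParser was missing a parser-state change when it processed an Allow line, so rule groups containing Allow could be assembled incorrectly, and the commit adds the suite's first Allow: test cases. A minimal sketch of the fixed behavior, mirroring test 8 below (not part of the test file itself):

    import urllib.robotparser

    robots = """\
    User-agent: Googlebot
    Allow: /folder1/myfile.html
    Disallow: /folder1/
    """
    parser = urllib.robotparser.RobotFileParser()
    parser.parse(robots.splitlines())   # parse() takes an iterable of lines
    parser.can_fetch("Googlebot", "/folder1/myfile.html")       # True
    parser.can_fetch("Googlebot", "/folder1/anotherfile.html")  # False
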
import io
import unittest
import urllib.robotparser
from test import support


class RobotTestCase(unittest.TestCase):
    def __init__(self, index, parser, url, good, agent):
        unittest.TestCase.__init__(self)
        if good:
            self.str = "RobotTest(%d, good, %s)" % (index, url)
        else:
            self.str = "RobotTest(%d, bad, %s)" % (index, url)
        self.parser = parser
        self.url = url
        self.good = good
        self.agent = agent

    def runTest(self):
        if isinstance(self.url, tuple):
            agent, url = self.url
        else:
            url = self.url
            agent = self.agent
        if self.good:
            self.failUnless(self.parser.can_fetch(agent, url))
        else:
            self.failIf(self.parser.can_fetch(agent, url))

    def __str__(self):
        return self.str

tests = unittest.TestSuite()

def RobotTest(index, robots_txt, good_urls, bad_urls,
              agent="test_robotparser"):
    # Parse robots_txt once, then register one test case per URL:
    # every good_url must be fetchable, every bad_url must be blocked.
    lines = io.StringIO(robots_txt).readlines()
    parser = urllib.robotparser.RobotFileParser()
    parser.parse(lines)
    for url in good_urls:
        tests.addTest(RobotTestCase(index, parser, url, 1, agent))
    for url in bad_urls:
        tests.addTest(RobotTestCase(index, parser, url, 0, agent))

# Examples from http://www.robotstxt.org/wc/norobots.html (fetched 2002)
# 1.
doc = """
User-agent: *
Disallow: /cyberworld/map/ # This is an infinite virtual URL space
Disallow: /tmp/ # these will soon disappear
Disallow: /foo.html
"""
good = ['/','/test.html']
bad = ['/cyberworld/map/index.html','/tmp/xxx','/foo.html']
RobotTest(1, doc, good, bad)
# 2.
doc = """
# robots.txt for http://www.example.com/
User-agent: *
Disallow: /cyberworld/map/ # This is an infinite virtual URL space
# Cybermapper knows where to go.
User-agent: cybermapper
Disallow:
"""
good = ['/','/test.html',('cybermapper','/cyberworld/map/index.html')]
bad = ['/cyberworld/map/index.html']
RobotTest(2, doc, good, bad)
# 3.
doc = """
# go away
User-agent: *
Disallow: /
"""
good = []
bad = ['/cyberworld/map/index.html','/','/tmp/']
RobotTest(3, doc, good, bad)
# Examples from http://www.robotstxt.org/wc/norobots-rfc.html (fetched 2002)
# 4.
doc = """
User-agent: figtree
Disallow: /tmp
Disallow: /a%3cd.html
Disallow: /a%2fb.html
Disallow: /%7ejoe/index.html
"""
good = [] # XFAIL '/a/b.html'
bad = ['/tmp','/tmp.html','/tmp/a.html',
       '/a%3cd.html','/a%3Cd.html','/a%2fb.html',
       '/~joe/index.html']
RobotTest(4, doc, good, bad, 'figtree')
RobotTest(5, doc, good, bad, 'FigTree Robot libwww-perl/5.04')
# 6.
doc = """
User-agent: *
Disallow: /tmp/
Disallow: /a%3Cd.html
Disallow: /a/b.html
Disallow: /%7ejoe/index.html
"""
good = ['/tmp',] # XFAIL: '/a%2fb.html'
bad = ['/tmp/','/tmp/a.html',
       '/a%3cd.html','/a%3Cd.html','/a/b.html',
       '/%7Ejoe/index.html']
RobotTest(6, doc, good, bad)
# From bug report #523041
# 7.
doc = """
User-Agent: *
Disallow: /.
"""
good = ['/foo.html']
bad = [] # Bug report says "/" should be denied, but that is not in the RFC
RobotTest(7, doc, good, bad)
# From Google: http://www.google.com/support/webmasters/bin/answer.py?hl=en&answer=40364
# 8.
doc = """
User-agent: Googlebot
Allow: /folder1/myfile.html
Disallow: /folder1/
"""
good = ['/folder1/myfile.html']
bad = ['/folder1/anotherfile.html']
RobotTest(8, doc, good, bad, agent="Googlebot")
# 9. This file is incorrect because "Googlebot" is a substring of
# "Googlebot-Mobile", so test 10 works just like test 9.
doc = """
User-agent: Googlebot
Disallow: /
User-agent: Googlebot-Mobile
Allow: /
"""
good = []
bad = ['/something.jpg']
RobotTest(9, doc, good, bad, agent="Googlebot")
good = []
bad = ['/something.jpg']
RobotTest(10, doc, good, bad, agent="Googlebot-Mobile")
# 11. Get the order correct.
doc = """
User-agent: Googlebot-Mobile
Allow: /
User-agent: Googlebot
Disallow: /
"""
good = []
bad = ['/something.jpg']
RobotTest(11, doc, good, bad, agent="Googlebot")
good = ['/something.jpg']
bad = []
RobotTest(12, doc, good, bad, agent="Googlebot-Mobile")
# 13. Google also got the order wrong in #8. You need to specify the
# URLs from more specific to more general.
doc = """
User-agent: Googlebot
Allow: /folder1/myfile.html
Disallow: /folder1/
"""
good = ['/folder1/myfile.html']
bad = ['/folder1/anotherfile.html']
RobotTest(13, doc, good, bad, agent="googlebot")
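# Editorial note (a sketch, not part of the original file): tests 9-13
# work the way they do because RobotFileParser matches user agents with a
# case-insensitive substring test.  For the robots.txt in test 9, e.g.:
#
#     parser.can_fetch("Googlebot-Mobile", "/something.jpg")  # -> False
#
# the "User-agent: Googlebot" entry is reached first, matches
# "Googlebot-Mobile" as a substring, and its "Disallow: /" wins.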

class NetworkTestCase(unittest.TestCase):

    def testPasswordProtectedSite(self):
        if not support.is_resource_enabled('network'):
            return
        # whole site is password-protected.
        url = 'http://mueblesmoraleda.com'
        parser = urllib.robotparser.RobotFileParser()
        parser.set_url(url)
        parser.read()
        self.assertEqual(parser.can_fetch("*", url + "/robots.txt"), False)

    def testPythonOrg(self):
        if not support.is_resource_enabled('network'):
            return
        parser = urllib.robotparser.RobotFileParser(
            "http://www.python.org/robots.txt")
        parser.read()
        self.assertTrue(parser.can_fetch("*",
                                         "http://www.python.org/robots.txt"))

def test_main():
    support.run_unittest(NetworkTestCase)
    support.run_unittest(tests)

if __name__ == '__main__':
    support.verbose = 1
    test_main()
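
A closing note on tests 4-6: they depend on RobotFileParser normalizing percent-escapes (rule paths and fetched URLs are unquoted and re-quoted before comparison), so a rule written with a lowercase escape also matches the uppercase spelling of the same path. A minimal sketch, again outside the test file itself:

    import urllib.robotparser

    parser = urllib.robotparser.RobotFileParser()
    parser.parse(["User-agent: *", "Disallow: /a%3cd.html"])
    parser.can_fetch("*", "/a%3cd.html")   # False
    parser.can_fetch("*", "/a%3Cd.html")   # False: escape case is normalized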