cpython/Tools/webchecker/robotparser.py

"""

Robots.txt file parser class.  Accepts a list of lines or robots.txt URL as
input, builds a set of rules from that list, then answers questions about
fetchability of other URLs.

"""

class RobotFileParser:
    """Parse a robots.txt file and answer whether an agent may fetch a URL.

    Typical use: call set_url() followed by read(), or hand a list of
    text lines straight to parse(); then query can_fetch().

    Only the 'User-agent' and 'Disallow' fields are interpreted.  A
    Disallow value is treated as a literal path prefix, not as a regular
    expression.
    """

    def __init__(self):
        # Maps a User-agent name (exactly as written in the file) to the
        # list of Disallow path prefixes that apply to it.  An empty list
        # means the agent is listed but nothing is disallowed.
        self.rules = {}
        # Any true value makes parse() and can_fetch() trace to stdout.
        self.debug = 0
        # Location of the robots.txt file fetched by read().
        self.url = ''
        # time.time() of the most recent parse(); 0 until the first one.
        self.last_checked = 0

    def _debug(self, *parts):
        """Write the parts, space separated, as one stdout line if debugging."""
        if self.debug:
            import sys
            sys.stdout.write(' '.join([str(part) for part in parts]) + '\n')

    def mtime(self):
        """Return the time recorded by the last call to modified()."""
        return self.last_checked

    def modified(self):
        """Record the current time as the moment the rules were last parsed."""
        import time
        self.last_checked = time.time()

    def set_url(self, url):
        """Remember the URL of the robots.txt file that read() will fetch."""
        self.url = url

    def read(self):
        """Fetch self.url and feed its lines to parse().

        Errors raised by urlopen() are not caught here.
        """
        # urlopen lives in urllib.request on Python 3 and urllib on Python 2.
        try:
            from urllib.request import urlopen
        except ImportError:
            from urllib import urlopen
        stream = urlopen(self.url)
        try:
            raw_lines = stream.readlines()
        finally:
            # Release the connection even if reading fails part-way.
            stream.close()
        lines = []
        for raw in raw_lines:
            # Anything that is not already a str (bytes on Python 3) is
            # decoded; undecodable bytes are replaced rather than raising.
            if not isinstance(raw, str):
                raw = raw.decode('utf-8', 'replace')
            lines.append(raw)
        self.parse(lines)

    def parse(self, lines):
        """Build self.rules from an iterable of robots.txt text lines.

        Lines may or may not carry a trailing newline.  A blank line ends
        the current record; a line holding only a comment does not.
        Rules accumulate across calls.  modified() is called at the end.
        """
        active = []     # agents named by the record currently being read
        for line in lines:
            self._debug('>', line.rstrip('\r\n'))
            # A blank (or whitespace-only, e.g. '\r\n') line terminates
            # the current record.
            if not line.strip():
                active = []
                continue
            # Remove the optional comment.  find() returns -1 when there
            # is none, and slicing with -1 would chop a real character.
            hash_pos = line.find('#')
            if hash_pos >= 0:
                line = line[:hash_pos]
            line = line.strip()
            if not line:
                continue
            # Split on the first colon only, so that values which contain
            # a colon themselves are kept intact.
            parts = line.split(':', 1)
            if len(parts) != 2:
                continue
            field = parts[0].strip().lower()
            value = parts[1].strip()
            if field == 'user-agent':
                # this record applies to this user agent
                self._debug('>> user-agent:', value)
                active.append(value)
                if value not in self.rules:
                    self.rules[value] = []
            elif field == 'disallow':
                if value:
                    self._debug('>> disallow:', value)
                    for agent in active:
                        self.rules[agent].append(value)
                else:
                    # An empty Disallow value allows everything, so any
                    # rules collected so far for these agents are dropped.
                    for agent in active:
                        self._debug('>> allow', agent)
                        self.rules[agent] = []
            else:
                self._debug('>> unknown:', [field, value])

        self.modified()

    def can_fetch(self, agent, url):
        """Return 1 if agent is allowed to fetch url, otherwise 0.

        The agent name is looked up exactly as given; when it has no entry
        the '*' entry is used, and when that is missing too the fetch is
        allowed.  Only the path component of url is compared, and a URL
        with an empty path is treated as '/'.
        """
        # urlparse lives in urllib.parse on Python 3 and urlparse on Python 2.
        try:
            from urllib.parse import urlparse
        except ImportError:
            from urlparse import urlparse
        ag = agent
        if ag not in self.rules:
            ag = '*'
        if ag not in self.rules:
            self._debug('>> allowing', url, 'fetch by', agent)
            return 1
        path = urlparse(url)[2] or '/'
        for prefix in self.rules[ag]:
            if path.startswith(prefix):
                self._debug('>> disallowing', url, 'fetch by', agent)
                return 0
        self._debug('>> allowing', url, 'fetch by', agent)
        return 1

def test():
    """Manual smoke test: fetch a live robots.txt and print a few verdicts.

    Needs network access.  Prints the parsed rule table, then the
    can_fetch() result for each (agent, url) pair below, in order.
    """
    parser = RobotFileParser()
    parser.debug = 1
    parser.set_url('http://www.automatrix.com/robots.txt')
    parser.read()
    print(parser.rules)
    probes = (
        ('*', 'http://www.calendar.com/concerts/'),
        ('Musi-Cal-Robot',
         'http://dolphin:80/cgi-bin/music-search?performer=Rolling+Stones'),
        ('Lycos', 'http://www/~skip/volkswagen/'),
        ('Lycos', 'http://www/~skip/volkswagen/vanagon-list-001'),
    )
    for agent, url in probes:
        print(parser.can_fetch(agent, url))
Skip Montanaro's robots.txt parser. 1997-01-30 03:18:23 +00:00			`"""`

			`Robots.txt file parser class. Accepts a list of lines or robots.txt URL as`
			`input, builds a set of rules from that list, then answers questions about`
			`fetchability of other URLs.`

			`"""`

			`class RobotFileParser:`

			`def __init__(self):`
Give in to tabnanny 1998-04-06 14:29:28 +00:00			`self.rules = {}`
			`self.debug = 0`
			`self.url = ''`
			`self.last_checked = 0`
Skip Montanaro's robots.txt parser. 1997-01-30 03:18:23 +00:00
			`def mtime(self):`
Give in to tabnanny 1998-04-06 14:29:28 +00:00			`return self.last_checked`
Skip Montanaro's robots.txt parser. 1997-01-30 03:18:23 +00:00
			`def modified(self):`
Give in to tabnanny 1998-04-06 14:29:28 +00:00			`import time`
			`self.last_checked = time.time()`
Skip Montanaro's robots.txt parser. 1997-01-30 03:18:23 +00:00
			`def set_url(self, url):`
Give in to tabnanny 1998-04-06 14:29:28 +00:00			`self.url = url`
			`## import urlmisc`
			`## self.url = urlmisc.canonical_url(url)`
Skip Montanaro's robots.txt parser. 1997-01-30 03:18:23 +00:00
			`def read(self):`
Give in to tabnanny 1998-04-06 14:29:28 +00:00			`import urllib`
			`self.parse(urllib.urlopen(self.url).readlines())`
Skip Montanaro's robots.txt parser. 1997-01-30 03:18:23 +00:00
			`def parse(self, lines):`
Give in to tabnanny 1998-04-06 14:29:28 +00:00			`import regsub, string, regex`
			`active = []`
			`for line in lines:`
			`if self.debug: print '>', line,`
			`# blank line terminates current record`
			`if not line[:-1]:`
			`active = []`
			`continue`
			`# remove optional comment and strip line`
			`line = string.strip(line[:string.find(line, '#')])`
			`if not line:`
			`continue`
			`line = regsub.split(line, ' : ')`
			`if len(line) == 2:`
			`line[0] = string.lower(line[0])`
			`if line[0] == 'user-agent':`
			`# this record applies to this user agent`
			`if self.debug: print '>> user-agent:', line[1]`
			`active.append(line[1])`
			`if not self.rules.has_key(line[1]):`
			`self.rules[line[1]] = []`
			`elif line[0] == 'disallow':`
			`if line[1]:`
			`if self.debug: print '>> disallow:', line[1]`
			`for agent in active:`
			`self.rules[agent].append(regex.compile(line[1]))`
			`else:`
			`pass`
			`for agent in active:`
			`if self.debug: print '>> allow', agent`
			`self.rules[agent] = []`
			`else:`
			`if self.debug: print '>> unknown:', line`
Skip Montanaro's robots.txt parser. 1997-01-30 03:18:23 +00:00
Give in to tabnanny 1998-04-06 14:29:28 +00:00			`self.modified()`
Skip Montanaro's robots.txt parser. 1997-01-30 03:18:23 +00:00
			`# returns true if agent is allowed to fetch url`
			`def can_fetch(self, agent, url):`
Give in to tabnanny 1998-04-06 14:29:28 +00:00			`import urlparse`
			`ag = agent`
			`if not self.rules.has_key(ag): ag = '*'`
			`if not self.rules.has_key(ag):`
			`if self.debug: print '>> allowing', url, 'fetch by', agent`
			`return 1`
			`path = urlparse.urlparse(url)[2]`
			`for rule in self.rules[ag]:`
			`if rule.match(path) != -1:`
			`if self.debug: print '>> disallowing', url, 'fetch by', agent`
			`return 0`
			`if self.debug: print '>> allowing', url, 'fetch by', agent`
			`return 1`
Skip Montanaro's robots.txt parser. 1997-01-30 03:18:23 +00:00
			`def test():`
			`rp = RobotFileParser()`
			`rp.debug = 1`
			`rp.set_url('http://www.automatrix.com/robots.txt')`
			`rp.read()`
			`print rp.rules`
			`print rp.can_fetch('*', 'http://www.calendar.com/concerts/')`
			`print rp.can_fetch('Musi-Cal-Robot',`
Give in to tabnanny 1998-04-06 14:29:28 +00:00			`'http://dolphin:80/cgi-bin/music-search?performer=Rolling+Stones')`
Skip Montanaro's robots.txt parser. 1997-01-30 03:18:23 +00:00
			`print rp.can_fetch('Lycos', 'http://www/~skip/volkswagen/')`
			`print rp.can_fetch('Lycos', 'http://www/~skip/volkswagen/vanagon-list-001')`