#!/usr/bin/python
"""CommentPuller -- trivial class to extract comments from HTML pages"""

import sys

try:  # Python 3 module layout
    from html.parser import HTMLParser
    from urllib.request import urlopen
except ImportError:  # Python 2 module layout
    from HTMLParser import HTMLParser
    from urllib import urlopen

__author__ = "Paul Bissex "
__version__ = "0.1"
__license__ = "MIT"
__usage__ = """Shell usage:
    CommentPuller.py <url>
Python usage:
    comment_list = CommentPuller('http://example.com').comments
"""


def _to_text(raw, headers):
    """Return the fetched page body as the interpreter's native ``str``.

    On Python 2 ``read()`` already yields ``str`` and the value is returned
    untouched.  On Python 3 ``read()`` yields ``bytes``, which
    ``HTMLParser.feed`` does not accept, so the body is decoded using the
    charset declared in the response headers, falling back to UTF-8.
    Undecodable bytes are replaced rather than raising.
    """
    if isinstance(raw, str):
        return raw
    get_charset = getattr(headers, "get_content_charset", None)
    charset = (get_charset() if get_charset else None) or "utf-8"
    try:
        return raw.decode(charset, "replace")
    except LookupError:
        # The server declared a charset name Python does not know.
        return raw.decode("utf-8", "replace")


class CommentPuller(HTMLParser):
    """Class for pulling comments out of HTML pages.

    Instantiating the class fetches ``url`` and parses it immediately.
    Afterwards ``html`` holds the page source and ``comments`` holds the
    text of every HTML comment, stripped of surrounding whitespace, in
    document order.
    """

    def __init__(self, url):
        """Set up object, fetch page HTML and parse it.

        url -- address of the page, passed unchanged to ``urlopen``.

        Errors raised by ``urlopen`` or ``read`` propagate to the caller.
        """
        # Explicit base call rather than super(): HTMLParser is an
        # old-style class on Python 2, where super() does not work.
        HTMLParser.__init__(self)
        response = urlopen(url)
        try:
            raw = response.read()
            headers = response.info()
        finally:
            # Always release the connection/file handle, even if read() fails.
            response.close()
        self.html = _to_text(raw, headers)
        self.comments = []
        self.feed(self.html)
        self.close()

    def handle_comment(self, data):
        """Override HTMLParser.handle_comment; collect comment strings"""
        self.comments.append(data.strip())


if __name__ == "__main__":
    try:
        url = sys.argv[1]
    except IndexError:
        # print(x) with a single argument behaves the same on Python 2 and 3.
        print(__usage__)
        sys.exit()
    comment_list = CommentPuller(url).comments
    for comment in comment_list:
        print(comment)