#!/usr/bin/pythonw
"""
XO.com Site Builder content extraction tools

The hosting company XO.com has, or had, a tool called "Site Builder" that let
users create and edit pages in a web browser. They seem to be abandoning it.
If you have content locked up in Site Builder files, this utility can help you
get it out and into a relational database.

Download the directory "files/sys-tmpl/nss-objects" from your XO account. Run
this script, pick that directory, then pick a directory for the output.

In addition to SQL output there's also an HTML option for producing a rough
static version of the pages.

This is by no means a polished or complete script, but it should be good
enough to pull your data from that particular sinking ship.
"""

__author__ = "Paul Bissex "
__version__ = "0.3.1"
__license__ = "GPL "

import errno
import os
import re
import sys
import types
import urllib

# MySQLdb is only needed for SQL output and EasyDialogs only for the GUI, so a
# missing module must not prevent importing this file for the other use cases.
try:
    import MySQLdb
except ImportError:
    MySQLdb = None

try:
    import EasyDialogs
except ImportError:
    EasyDialogs = None


def _require_easydialogs():
    """Return the EasyDialogs module, or raise ImportError if unavailable."""
    if EasyDialogs is None:
        raise ImportError("EasyDialogs is required for the GUI")
    return EasyDialogs


class SiteBuilderParser:
    """Tools to extract story data from a downloaded 'nss-objects' directory"""

    def __init__(self, input, output, GUI=False):
        """Get directory names, which are also story names.

        input  -- path of the downloaded 'nss-objects' directory
                  (omit the trailing slash)
        output -- directory that rendered files are written to
        GUI    -- when true, show an EasyDialogs progress bar while rendering

        After construction, self.filters, self.mapping and self.page_template
        may be adjusted as desired before any of the render_* methods is
        called.

        Raises ImportError if GUI is requested but EasyDialogs is missing.
        """
        self.root = input
        self.GUI = GUI
        self.output_dir = output

        # store names of all non-dot dirs in input
        self.story_dirs = [
            name for name in os.listdir(self.root)
            if name.strip() and not name.startswith('.')
            and os.path.isdir(os.path.join(self.root, name))
        ]

        # initialize page content dict
        self.pages = {}

        # filter functions for values that need translation
        # TODO: use our internal keys, not NSS keys
        # TODO: multiple filters (for body, mostly)
        # Plain substring replacement: the URLs are literals, not patterns.
        def page_href_cleanup(text):
            return text.replace(
                "http://www.jonreed.net.cnchost.com/sys-tmpl", "")

        def img_src_cleanup(text):
            return text.replace(
                "http://www.jonreed.net.cnchost.com/", "images/")

        # A filter is either one callable or a tuple of callables that are
        # applied left to right.
        self.filters = {
            'HTML': (urllib.unquote, page_href_cleanup, img_src_cleanup),
            'PageTitle': urllib.unquote,
            'PageName': urllib.unquote,
            'Access-PublicRead': (lambda x: x == 'on'),
        }

        # map NSS keys to keys in our content DB; the order of this list is
        # also the order of the values in the generated SQL
        self.mapping = [
            ('ObjectUID', 'ID'),
            ('Access-PublicRead', 'publish'),
            ('PageTitle', 'full_title'),
            ('PageName', 'short_title'),
            ('HTML', 'body'),
        ]

        # template for page rendering
        # NOTE(review): this looks like an HTML template whose tags have been
        # lost; restore the markup here if that is the case.
        self.page_template = "%(short_title)s\n\n%(full_title)s\n\n%(body)s"
        self.sql_template = (
            "INSERT INTO %s VALUES ('%s', '%s', '%s', '%s', '%s');\n")
        self.db_table = "content"

        # progress bar; stays None when running without the GUI
        self.bar = None
        if self.GUI:
            self.bar = _require_easydialogs().ProgressBar(
                title="Processing Sitebuilder files")

    def bar_inc(self):
        """Advance the progress bar by one step (no-op without a bar)."""
        if self.bar is None:
            return
        self.bar.inc()

    def bar_setup(self, label="Processing...", start=0, end=100):
        """Reset the progress bar's range and label (no-op without a bar)."""
        if self.bar is None:
            return
        self.bar.set(start, max=end)
        self.bar.label(label)

    def bar_close(self):
        """Drop the progress bar; safe to call more than once."""
        self.bar = None

    def get_output_dir(self, base):
        """Generate path to output directory (alongside input directory)"""
        return os.path.join(os.path.split(base)[0], "html_output")

    def fetch_story_lines(self, story_name):
        """Fetch the lines of an individual '!object' file"""
        path = os.path.join(self.root, story_name, '!object')
        with open(path) as story_file:
            raw_contents = story_file.read()
        return raw_contents.split('\n')

    def fetch_story_bits(self, story_name):
        """Chop each line into a key/value pair and return them as a dict"""
        contents = {}
        for line in self.fetch_story_lines(story_name):
            if not line.strip():
                continue
            # each line contains key, space, value; a line without a space
            # is kept as a key with an empty value
            key, _, value = line.partition(' ')
            # strip initial hashmark from key; strip quotes from value
            contents[key[1:]] = value.strip('"')
        return contents

    def store_story(self, story_name):
        """Read one story, map and filter its values, and keep the result.

        The result is stored in self.pages[story_name], keyed by the DB keys
        from self.mapping. Exceptions raised by a filter propagate to the
        caller.
        """
        raw_bits = self.fetch_story_bits(story_name)
        bits = {}
        for nss_key, db_key in self.mapping:
            # fetch value; make it blank if nothing's there
            value = raw_bits.get(nss_key, '')
            # filter the value, if appropriate
            value_filter = self.filters.get(nss_key)
            if isinstance(value_filter, tuple):
                for func in value_filter:
                    value = func(value)
            elif callable(value_filter):
                value = value_filter(value)
            bits[db_key] = value
        self.pages[story_name] = bits
        self.tag = story_name

    def render_page(self, story_name):
        """Return HTML for a single page"""
        self.store_story(story_name)
        # TODO: clean up links and img refs, etc.
        return self.page_template % self.pages[story_name]

    def render_sql_insert(self, story_name):
        """Return SQL for a single page

        Values are emitted in the order of self.mapping. Prints the problem
        and exits with status 1 if the values do not fit self.sql_template.
        Raises ImportError if MySQLdb is not installed.
        """
        if MySQLdb is None:
            raise ImportError("MySQLdb is required for SQL output")
        self.store_story(story_name)
        page = self.pages[story_name]
        for key in list(page):
            if key != "publish":  # special case for boolean -- hackish
                page[key] = MySQLdb.escape_string(page[key])
        # follow the mapping order; the order of a dict's values is arbitrary
        page_data = [page[db_key] for (_, db_key) in self.mapping]
        try:
            sql = self.sql_template % tuple([self.db_table] + page_data)
        except TypeError as e:
            print("Error: %s" % (e,))
            print(page_data)
            sys.exit(1)
        return sql

    def render_index_page(self):
        """Create a simple index page for all listed pages"""
        # NOTE(review): the previous literal had one %s for two arguments
        # (a TypeError at runtime) and its markup appears to have been lost.
        # Rebuilt as a list item linking to the file render_all_pages writes
        # for the page -- confirm against the intended markup.
        index_line_template = '<li><a href="%s.html">%s</a></li>\n'
        index_lines = []
        # sorted so that the index is the same on every run
        for page, bits in sorted(self.pages.items()):
            title = bits['full_title'] or bits['short_title']
            if bits['publish'] and title:
                index_lines.append(index_line_template % (page, title))
        index_file = os.path.join(self.output_dir, "index.html")
        with open(index_file, "w") as index:
            index.write("".join(index_lines))

    def render_all_pages(self):
        """Write files for all pages, plus index"""
        try:
            os.mkdir(self.output_dir)
        except OSError as e:
            # an existing directory is OK; compare the error code rather
            # than the (locale-dependent) message text
            if e.errno != errno.EEXIST:
                print("Error when trying to create directory: %s" % (e,))
                sys.exit(1)
        self.bar_setup("Rendering HTML pages...", 0, len(self.story_dirs))
        for story_name in self.story_dirs:
            html = self.render_page(story_name)
            story_file = os.path.join(self.output_dir, story_name + ".html")
            with open(story_file, "w") as out:
                out.write(html)
            self.bar_inc()
        self.render_index_page()

    def render_all_sql(self):
        """Create SQL file for page data"""
        sql_file = os.path.join(self.output_dir, "stories.sql")
        self.bar_setup("Rendering SQL database code...", 0,
                       len(self.story_dirs))
        with open(sql_file, "w") as out:
            for story_name in self.story_dirs:
                out.write(self.render_sql_insert(story_name))
                self.bar_inc()

    def cleanup(self):
        """Release GUI resources; safe to call more than once."""
        self.bar_close()


def choose_folder(prompt="Please select the folder"):
    """Ask user to choose a folder; return whatever the dialog returns"""
    return _require_easydialogs().AskFolder(message=prompt)


def ask_output_type(options):
    """Use yes/no dialog to ask about output.

    Slightly awkward: the "no" button is labelled options[0] and the "yes"
    button options[1]. Returns the chosen option, or None on cancel.
    """
    reply = _require_easydialogs().AskYesNoCancel(
        question="which?", default=1, no=options[0], yes=options[1])
    if reply == -1:  # cancel
        return None
    print("Chose option %d: %s" % (reply, options[reply]))
    return options[reply]


def main():
    """Ask for the input/output folders and output type, then render."""
    nss_dir = choose_folder("Please select NSS-Objects directory")
    output_dir = choose_folder("Please select output directory")
    # presumably the dialog returns None when cancelled -- verify against
    # the EasyDialogs documentation
    if not nss_dir or not output_dir:
        print("No folder selected")
        return
    output_type = ask_output_type(['HTML', 'SQL'])
    parser = SiteBuilderParser(input=nss_dir, output=output_dir, GUI=True)
    try:
        if output_type == "SQL":
            parser.render_all_sql()
        elif output_type == "HTML":
            parser.render_all_pages()
        else:
            print("Unrecognized output type")
    finally:
        # close the progress bar even if rendering fails
        parser.cleanup()


if __name__ == "__main__":
    main()