#!/usr/bin/env python
from xml.sax import make_parser, parse
from xml.sax.handler import ContentHandler
import os.path, StringIO, urllib

import sqlite

# XML is based on Unicode, so we try to use Unicode in this example program
# wherever feasible.

# Names of the child elements of a <project> XML element in the data file.
# The same names are reused as Project attribute names and as the column
# names of the "project" database table.
PROJECT_ATTRS = u"""
    project_id date_added date_updated
    projectname_short projectname_full
    desc_short desc_full
    vitality_score vitality_percent
    popularity_score popularity_percent
    rating rating_count
    branch_name
    url_homepage url_tgz url_changelog url_rpm url_deb url_bz2
    url_cvs url_list url_slp url_zip
    license latest_version
""".split()

class Project:
    """A Freshmeat project record that can render itself as XML again.

    Every name listed in PROJECT_ATTRS becomes an instance attribute, so the
    object mirrors the child elements of a <project> XML element.
    """

    def __init__(self):
        # Use exactly the same attributes as the XML element.
        # All attributes default to None.
        for name in PROJECT_ATTRS:
            self.__dict__[name] = None

    def toxml(self):
        """Render the project as a <project> XML element.

        Each instance attribute becomes one child element. Attributes are
        emitted sorted by name so the output is reproducible (plain dict
        order is arbitrary). Falsy values such as None are rendered as empty
        elements. The characters &, < and > in values are escaped so the
        result stays well-formed XML.

        Returns the XML fragment as a Unicode string.
        """
        # Imported here so this class does not depend on a new module-level
        # import being present.
        from xml.sax.saxutils import escape

        parts = [u"<project>\n"]
        for name in sorted(self.__dict__):
            value = self.__dict__[name]
            # Convert to text first: escape() only accepts strings, while
            # attributes may hold any value.
            text = escape(u"%s" % (value or u"",))
            parts.append(u"  <%s>%s</%s>\n" % (name, text, name))
        parts.append(u"</project>\n")
        return u"".join(parts)

class SaxHandler(ContentHandler):
    """SAX handler that imports every <project> element into the database.

    The handler is a small state machine: it remembers the project being
    built, the known child element currently open (if any) and the character
    data collected for that element so far.
    """

    def __init__(self, cursor):
        """Create a handler that writes projects through `cursor`.

        `cursor` is a database cursor whose execute() accepts a 'pyformat'
        SQL string together with a dictionary of parameters.
        """
        ContentHandler.__init__(self)
        self.cursor = cursor
        self.have_shown_sql = 0
        self.counter = 0
        # Parser state. Initialised here so the callbacks never touch an
        # attribute that does not exist yet.
        self.current_project = None
        self.current_element = None
        self.current_buffer = u""
        # The INSERT statement is identical for every project, so it is built
        # on first use and then reused.
        self._insert_sql = None

    def startElement(self, name, attrs):
        """This handler is called for every opening XML element."""
        if name == u"project":
            self.current_project = Project()
        elif name in PROJECT_ATTRS:
            # Start collecting character data for this element.
            self.current_element = name
            self.current_buffer = u""
        else:
            # Ignore unknown XML elements
            self.current_element = None

    def endElement(self, name):
        """This handler is called for every closing XML element."""
        if name == u"project":
            self.db_handler()
        elif name in PROJECT_ATTRS:
            # Store the text only if we were really collecting for this
            # element. An unknown nested element resets current_element to
            # None; storing then would create a bogus None attribute.
            if self.current_element == name and self.current_project is not None:
                self.current_project.__dict__[name] = self.current_buffer
            # The element is closed: stop collecting character data.
            self.current_element = None

    def characters(self, s):
        """This handler is called for a chunk of character data."""
        if self.current_element is not None:
            self.current_buffer += s

    def db_handler(self):
        """Persist the current project into the database."""
        # We use a few tricks to construct the SQL query string. We exploit
        # the fact that the attributes of a Project object have the same
        # name as the columns in the database table.
        sql = self._insert_sql
        if sql is None:
            # Concatenate the column names, separated by commas.
            project_columns = ", ".join(PROJECT_ATTRS)

            # Create a template for named string substitution.
            value_str = ", ".join(["%(" + attr + ")s" for attr in PROJECT_ATTRS])

            sql = "insert into project (%s) values (%s)" % (project_columns, value_str)
            self._insert_sql = sql

        # For illustration purposes, show the resulting SQL string, but only once.
        if not self.have_shown_sql:
            print("This is the SQL string for the INSERT statement:")
            print(sql)
            self.have_shown_sql = 1

        # We exploit the 'pyformat' quoting style of PySQLite here,
        # and just supply a dictionary as second parameter to execute:
        self.cursor.execute(sql, self.current_project.__dict__)

        # Report progress every 100 imported projects.
        self.counter += 1
        if self.counter % 100 == 0:
            print("Successfully imported project #%i: %s."
                  % (self.counter, self.current_project.projectname_short))

def download_file():
    """Interactively offer to download the Freshmeat XML dump.

    The user chooses between the bzip2-compressed download (uncompressed
    afterwards with the external `bunzip2` program), the uncompressed
    download, or quitting. Either download ends up as "fm-projects.rdf" in
    the working directory.

    Returns 1 if a file was downloaded, 0 if the user quit, input ended, or
    uncompressing failed.
    """
    target = "fm-projects.rdf"
    archive = target + ".bz2"

    print("The file %s doesn't exist in the working directory." % target)
    print("I can now download the XML sources from freshmeat.net for you")
    print("")

    choice = ' '
    # Only the three options offered in the menu end the loop.
    while choice not in ("c", "d", "q"):
        print("c - Download bzip2 compressed file and uncompresss it. (ca 3.2 MB - need bunzip2 in PATH)")
        print("d - Download uncompressed file (ca. 32.3 MB)")
        print("q - Quit")
        try:
            # Slicing yields "" for an empty line, which simply re-prompts.
            choice = raw_input()[:1].lower()
        except EOFError:
            # No more input will ever arrive; asking again would loop forever.
            choice = "q"

    if choice == "q":
        return 0

    if choice == "c":
        print("Fetching file ...")
        urllib.urlretrieve("http://freshmeat.net/backend/fm-projects.rdf.bz2",
                           archive)

        print("Uncompressing file ...")
        # bunzip2 replaces the archive with the uncompressed target file.
        if os.system("bunzip2 " + archive) != 0:
            print("Uncompressing %s failed." % archive)
            return 0
    else:
        print("Fetching file ...")
        # Save under the name the importer looks for, not the archive name.
        urllib.urlretrieve("http://freshmeat.net/backend/fm-projects.rdf",
                           target)
    return 1

def main():
    """Import the Freshmeat XML dump into the SQLite database file "db".

    If the XML file is missing, the user is offered a download first; the
    function returns without doing anything when that is declined. Otherwise
    it creates the "project" table, parses the file with SaxHandler and
    commits the inserted rows.
    """
    FILENAME = "fm-projects.rdf"
    if not os.path.exists(FILENAME):
        if not download_file():
            return

    # Since we're dealing with Unicode strings, we must tell PySQLite
    # which 8-bit encoding to encode them to. The input from Freshmeat
    # is in the iso-8859-1 (aka latin1) encoding.
    conn = sqlite.connect("db", client_encoding="iso-8859-1")
    try:
        cursor = conn.cursor()

        cursor.execute("create table project (%s)" % (", ".join(PROJECT_ATTRS),))

        handler = SaxHandler(cursor)
        parse(FILENAME, handler)

        # Commit only after the whole file was imported successfully.
        conn.commit()
    finally:
        # Release the connection even if table creation or parsing failed.
        conn.close()

# Run the import only when executed as a script, not when imported as a module.
if __name__ == "__main__":
    main()
