#!/usr/bin/python
"""MachineCode interpreter.

Interprets and caches MachineCode from XHTML pages.

DataBlockStore -- Create DataBlock instances.
DataBlock -- A MachineCode block on a website.
"""

import re

import pprint
import HTMLParser, htmlentitydefs

import webcache

import mcb_rdf

class DataBlockStore:

    """Creates and caches data blocks.

    When you want the DataBlocks of a page, call "get_blocks". The page
    is fetched and parsed on first use; later calls are answered from
    the blocks cache.

    The DataBlockStore also maintains a shared webpage cache.

    get_blocks -- Obtain the DataBlocks of a page, or the ones with a given id
    read_page -- Fetch and parse a page, replacing its cached DataBlocks
    """

    def __init__(self, directory = "."):
        """Set up the page cache files inside `directory`."""
        # NOTE(review): the third argument is presumably the maximum age of
        # a cached page in seconds (one day) -- confirm against webcache.
        self.cache = webcache.WebCache(directory + "/pages.db",
                                       directory + "/times.db",
                                       24*60*60)  # Shared webpage cache
        self.blocks = {}  # Blocks cache: page url -> list of DataBlocks

    def get_blocks(self, url, baseurl=None):
        """Return the DataBlocks a reference points to.

        url -- an absolute page URL, optionally followed by "#id", or,
               when baseurl is given, a reference relative to that page
               ("#id" or just "id")
        baseurl -- the page that relative references are resolved against

        Without an id the result is the cached list of all blocks of the
        page (the list object itself, not a copy). With an id it is a new
        list holding the blocks whose first "id" value equals that id;
        the list is empty when nothing matches.

        The page is read and cached the first time it is asked for.
        """
        is_absolute = (url.startswith("http://")
                       or url.startswith("https://"))
        if baseurl and not is_absolute:
            # Relative reference: everything names a block on baseurl.
            # startswith() instead of url[0] so an empty reference means
            # "the whole base page" rather than raising IndexError.
            if url.startswith("#"):
                name = url[1:]
            else:
                name = url
            url = baseurl
        else:
            name = None
            parts = url.split("#", 1)
            if len(parts) == 2:
                name = parts[1]
            url = parts[0]

        if url not in self.blocks:
            self.read_page(url)

        if not name:
            return self.blocks[url]
        return [block for block in self.blocks[url]
                if block.get("id", [""])[0] == name]

    def read_page(self, url):
        """Fetch `url` through the page cache, parse it and cache its blocks.

        Every parsed block gets its `_url` attribute set to `url`, so that
        relative type references inside it can be resolved later. Types
        are not applied here.
        """
        page = self.cache.get_page(url)
        parser = MachineCodeHtmlParser()
        parser.feed(page)
        parser.close()
        blocks = parser.blocks
        for block in blocks:
            block._url = url
        self.blocks[url] = blocks

class DataBlock(dict):

    """One MachineCode block: a dict mapping each key to a list of values.

    A value is either a string or a nested DataBlock. A block may name
    its type under the "type" key; a type is itself a DataBlock whose
    "attribute" values describe the allowed keys ("key", "type",
    "required").

    strict_types -- when true, apply_type drops values that do not fit
                    their declared type instead of keeping them as they are
    """

    strict_types = 0
    # Page the block was read from; set by whoever parsed the page. The
    # default keeps get_type/apply_type usable on hand-built blocks.
    _url = None

    def store_value(self, key, value):
        """Append `value` to the list of values stored under `key`."""
        self.setdefault(key, []).append(value)

    def get_required_attributes(self, type):
        """Return the keys of the attributes that `type` marks as required.

        An attribute counts as required unless its first "required" value
        converts to integer 0. Attributes without a "key" are skipped.
        Returns [] for an empty or missing type.
        """
        if not type: return []
        return [attr["key"][0]
                for attr in type.get("attribute", [])
                if int(attr.get("required", [1])[0]) and "key" in attr]

    def get_type(self, store):
        """Return the DataBlock describing this block's type, or None.

        The first "type" value is used. A string is looked up through
        store.get_blocks, relative to the page this block came from; a
        nested block is returned as it is.
        """
        declared = self.get("type", None)
        if not declared: return None
        declared = declared[0]
        if isinstance(declared, str):
            found = store.get_blocks(declared, self._url)
            if not found: return None
            declared = found[0]
        return declared

    def get_type_attribute(self, type, key):
        """Return the attribute block of `type` whose key is `key`, or None."""
        for attr in type.get('attribute', []):
            k = attr.get('key',[])
            if k and k[0]==key: return attr
        return None

    # Value handlers. apply_type looks them up by name as
    # "_handle_<attribute type>"; each takes one value and returns the
    # converted value, or None to reject it in strict mode.

    def _handle_string(self, value):
        if not isinstance(value, str) and self.strict_types:
            return None
        else:
            return value
    def _handle_number(self, value):
        try:
            return float(value)
        except (TypeError, ValueError):
            # TypeError: the value is a nested block, not a string.
            if self.strict_types:
                return None
            else:
                return value
    def _handle_date(self, value):
        return value #XXX
    def _handle_url(self, value):
        value = self._handle_string(value)
        # XXX href replacement ._url extention
        return value
    def _handle_any(self, value):
        return value

    def _Handle_block(self, value, block_type, store):
        """Check a nested block against `block_type` and apply its type.

        The capital H is deliberate: it keeps this three-argument method
        out of the "_handle_<type>" name lookup done for attribute types.

        A nested block without a type is given `block_type`. In strict
        mode a non-block value, or a block of another type, yields None.
        """
        if isinstance(value, DataBlock):
            value._url = self._url
            value_type = value.get_type(store)
            if value_type is block_type:
                return value
            elif value_type is None:
                value['type'] = [block_type]
            elif self.strict_types:
                return None
            value.apply_type(store)
        elif self.strict_types:
            return None
        return value

    def _attribute_handler(self, attr_type, store):
        """Return the one-argument converter for a declared attribute type."""
        if isinstance(attr_type, str):
            attr_type = attr_type.strip()
            handler = getattr(self, "_handle_%s" % attr_type, None)
            if handler:
                return handler
            # Not a built-in type name: treat it as a reference to a block.
            type_blocks = store.get_blocks(attr_type, self._url)
            if type_blocks:
                type_block = type_blocks[0]
                return lambda v: self._Handle_block(v, type_block, store)
            return self._handle_any #XXX Blocks
        if isinstance(attr_type, DataBlock):
            return lambda v: self._Handle_block(v, attr_type, store)
        # Anything else (e.g. a non-str string type): leave values alone
        # rather than fail on an undefined handler.
        return self._handle_any

    def apply_type(self, store):
        """Convert the stored values according to this block's type.

        For every key that the type describes, each value is passed
        through the handler of the attribute's declared type (default
        "any"). In strict mode values the handler rejects are removed.
        Keys the type does not describe are left untouched. Does nothing
        when the block has no resolvable type.
        """
        block_type = self.get_type(store)
        if not block_type: return
        # Iterate over a snapshot, since values are replaced on the way.
        for key, values in list(self.items()):
            attribute = self.get_type_attribute(block_type, key)
            if not attribute:
                continue
            attr_type = attribute.get("type", ['any'])[0]
            handler = self._attribute_handler(attr_type, store)
            values = [handler(v) for v in values]
            if self.strict_types:
                values = [v for v in values if v is not None]
            self[key] = values

                
class MachineCodeHtmlParser(HTMLParser.HTMLParser):

    """Extract MachineCode blocks from the text of an (X)HTML page.

    The parser scans the character data of the page for blocks like

        BEGINBLOCK
          key: plain text value;
          key% value kept as <b>raw</b> html;
          key$ BEGINBLOCK ... ENDBLOCK
        ENDBLOCK

    A key is a run of word characters and dashes, followed by one
    delimiter:

      ":"  text value -- tags inside it are dropped, entity and
           character references are decoded
      "%"  raw value -- tags and references are kept as markup
      "$"  the value is a nested block

    A text or raw value ends at a ";" that is not followed by another
    ";". A block may span several tags and data chunks.

    After parsing, `blocks` holds the finished top-level DataBlocks.
    """

    beginblock = "BEGINBLOCK"
    endblock_re = re.compile(r"\s*ENDBLOCK", re.M)
    delim_re = re.compile(r"\s*([:$%])")
    delim_quoted = ":"
    delim_raw = '%'
    delim_block = "$"
    delim_end = re.compile(";(?!;)")
    key_re = re.compile(r"[\w-]+", re.M)
    # Parser states (unique marker objects, compared by identity/equality)
    block_search = object()   # looking for BEGINBLOCK
    key_search = object()     # inside a block, looking for a key or ENDBLOCK
    delim_search = object()   # key read, looking for its delimiter
    raw_html = object()       # collecting a "%" value
    unquote_html = object()   # collecting a ":" value

    def __init__(self):
        HTMLParser.HTMLParser.__init__(self)

        self.data_block = None  # Block being filled; its .key is the current key

        self.state = self.block_search
        self.value = []   # Pieces of the value being collected
        self.stack = []   # Enclosing blocks of the current nested block
        self.blocks = []  # Finished top-level blocks

    def handle_starttag(self, tag, attrs):
        """Keep the tag as markup while a raw value is being collected."""
        if self.state!=self.raw_html:
            return
        parts = []
        for key, value in attrs:
            if value is None:
                # Attribute without a value, e.g. <input disabled>
                parts.append(' %s' % key)
            else:
                parts.append(' %s="%s"' % (key, value))
        self.value.append("<%s%s>" % (tag, "".join(parts)))

    def handle_endtag(self, tag):
        """Keep the end tag as markup while a raw value is being collected."""
        if self.state==self.raw_html:
            self.value.append("</%s>" % tag)

    def handle_data(self, data):
        """Advance the block state machine over one chunk of text.

        The chunk is consumed from left to right; the parser state is
        kept between calls, so a block, key or value may continue in the
        next chunk.
        """
        pos = 0

        while pos<len(data):
            # search BEGINBLOCK
            if self.state==self.block_search:
                pos = data.find(self.beginblock, pos)
                if pos == -1:
                    return
                pos += len(self.beginblock)
                self.state = self.key_search
                # A block is pending here only when a "$" key opened a
                # nested block. Compare with None: a parent that has no
                # values yet is an empty (false) dict and must be kept.
                if self.data_block is not None:
                    self.stack.append(self.data_block)
                self.data_block = DataBlock()

            # search KEY
            if self.state==self.key_search:
                # ENDBLOCK
                match = self.endblock_re.match(data, pos)
                if match:
                    pos = match.end()
                    if self.stack:
                        # Nested block done: store it under the key of
                        # the enclosing block and continue there.
                        value_block = self.data_block
                        self.data_block = self.stack.pop()
                        self.data_block.store_value(self.data_block.key,
                                                    value_block)
                        self.state = self.key_search
                    else:
                        self.state = self.block_search
                        self.blocks.append(self.data_block)
                        self.data_block = None
                # KEY
                else:
                    match = self.key_re.search(data, pos)
                    if match:
                        self.data_block.key = match.group()
                        self.state = self.delim_search
                        pos = match.end()
                    else:
                        return

            # search DELIMITER
            if self.state == self.delim_search:
                match = self.delim_re.match(data, pos)
                if match:
                    pos = match.end(1)
                    delimiter = match.group(1)
                    if delimiter==self.delim_raw:
                        self.state=self.raw_html
                    elif delimiter==self.delim_quoted:
                        self.state=self.unquote_html
                    elif delimiter==self.delim_block:
                        self.state=self.block_search
                else:
                    return

            # search VALUE/ ENDDELIMITER ;
            if self.state in [self.raw_html, self.unquote_html]:
                match = self.delim_end.search(data, pos)
                if match:
                    self.value.append(data[pos:match.start(0)])
                    pos = match.end(0)
                    value = "".join(self.value)
                    self.value = []
                    self.data_block.store_value(self.data_block.key, value)
                    self.state = self.key_search
                else:
                    # Value continues in a later chunk
                    self.value.append(data[pos:])
                    return

    def handle_entityref(self, name):
        """Add an entity reference to the value being collected.

        Raw values keep the reference as written; text values get its
        replacement from htmlentitydefs, or the reference as written
        when the entity is unknown.
        """
        if self.state==self.raw_html:
            self.value.append("&%s;" % name)
        elif self.state==self.unquote_html:
            self.value.append(
                htmlentitydefs.entitydefs.get(name, "&%s;" % name))
        # XXX break keys?

    def handle_charref(self, name):
        """Add a numeric character reference to the value being collected.

        Raw values keep the reference as written; text values get the
        character itself. A reference that is not a valid number is kept
        as written. Outside of a value the reference is ignored.
        """
        if self.state==self.raw_html:
            self.value.append("&#%s;" % name)
            return
        if self.state!=self.unquote_html:
            return

        try:
            if name[:1] in ("x", "X"):
                code = int(name[1:], 16)
            else:
                code = int(name)
            if code < 128:
                # Plain str, so ASCII-only values do not turn into unicode
                character = chr(code)
            else:
                character = unichr(code)
        except (ValueError, OverflowError):
            character = "&#%s;" % name
        self.value.append(character)
        # XXX encoded delimiter, key chars, ...

    
if __name__ == "__main__":

    # Manual check: read one page, dump its blocks and hand each of them
    # to the RDF layer.
    #
    # Other pages that were tried here:
    #   http://www.emacswiki.org/cw/CommunityWiki
    #   http://moinmoin.wikiwikiweb.de/MachineCodeBlocks3/MetaSchema
    page_url = "http://moinmoin.wikiwikiweb.de/MachineCodeBlocks3/RdfIntegration"

    block_store = DataBlockStore()
    page_blocks = block_store.get_blocks(page_url)
    print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
    world = mcb_rdf.RDF_World()
    for page_block in page_blocks:
        pprint.pprint(page_block)
        world.mcb_rdf(page_block)

    # To look at the result (disabled):
    #   print world.store.serialize(format="pretty-xml")
