# -*- coding: utf-8 -*-

# Publishing back-end selector read by publisher_factory(): 'plone' selects the
# XML-RPC target, any other value selects the local-filesystem target.
# The `and` expression evaluates to its right-hand operand, i.e. 'plone';
# keep only 'local' to switch to the local target.
_TaRgEt = 'local' and 'plone'

import xmlrpclib, re, sys, md5
from xml.sax import saxutils
from xml.sax import make_parser
from xml.sax.handler import feature_namespaces, ContentHandler
from datetime import datetime
from string import lower
from keywords import keywords_from_content
from base64 import b64encode, b64decode
import pickle
import threading, time, os, popen2



########## publishing stuff ##########

def tidy(content):
    """Re-indent XML `content` by piping it through the external ``tidy`` tool.

    `content` is written to the standard input of ``tidy`` and the program's
    standard output is decoded from UTF-8 and returned.

    The exchange goes through ``communicate()`` so that input is fed and
    output is drained concurrently (writing everything before reading can
    block forever once the pipe buffers fill up), and so that the child
    process is waited for instead of being left behind.
    """
    import subprocess

    command = ['tidy', '-xml', '-utf8', '-i', '-m', '-w', '75',
               '--drop-empty-paras', 'y', '-q']
    process = subprocess.Popen(command,
                               stdin=subprocess.PIPE,
                               stdout=subprocess.PIPE)
    output = process.communicate(content)[0]
    return output.decode('utf-8')

def tidy_dummy(content):
    """No-op stand-in for tidy(): hand `content` back untouched."""
    untouched = content
    return untouched

class SequenceableProxy(xmlrpclib.ServerProxy):
    """
    ServerProxy whose equality tests are answered locally, by identity.

    We need that: list remove() keeps on calling xml-rpc for
    object comparison :(

    Both ``==`` and ``!=`` are defined so that neither operator is handed to
    the proxy's attribute forwarding (which would turn it into a remote call).
    """
    def __eq__(self, other):
        """Two proxies are equal only when they are the very same object."""
        return self is other

    def __ne__(self, other):
        """Exact negation of __eq__, also resolved without any remote call."""
        return self is not other

class PublishTarget:
    """Common interface of the publishing back-ends.

    Every operation is a no-op here; subclasses override what they support.
    """

    def __init__(self, target):
        # Remove every newline character from the target string.
        self.target = ''.join(target.split('\n'))

    def xmlpost(self, dest, doc_id, title, content, type, format, state,
                descr, edate, subject, checksum):
        """Publish one document. Does nothing in the base class."""
        return None

    def deleteArticles(self, ids):
        """Delete the documents listed in `ids`. Does nothing in the base class."""
        return None

    def md5digest(self, folder):
        """Return the digests known for `folder`. Returns None in the base class."""
        return None

class PloneTarget(PublishTarget):
    """Publishing target that forwards every operation to an XML-RPC proxy."""

    def __init__(self, target):
        PublishTarget.__init__(self, target)
        # self.target was cleaned up by the base constructor just above.
        self.plone = SequenceableProxy(self.target)

    def xmlpost(self, dest, doc_id, title, content, type, format, state,
                descr, edate, subject, checksum):
        """Call the remote ``xmlpost`` method; its result is discarded."""
        arguments = (dest, doc_id, title, content, type, format, state,
                     descr, edate, subject, checksum)
        self.plone.xmlpost(*arguments)

    def deleteArticles(self, ids):
        """Call the remote ``deleteArticles`` method; its result is discarded."""
        remote_delete = self.plone.deleteArticles
        remote_delete(ids)

    def md5digest(self, folder):
        """Return whatever the remote ``md5digest`` method answers for `folder`."""
        digest = self.plone.md5digest(folder)
        return digest

class LocalTarget(PublishTarget):
    """Publishing target that stores documents as files below `self.target`."""

    def xmlpost(self, dest, doc_id, title, content, type, format, state,
                descr, edate, subject, checksum):
        """Write the base64-decoded `content` to <target>/<dest>/<doc_id>.xml.

        The <dest> directory is created when missing. All other arguments
        are ignored by this target.
        """
        folder = os.path.join(self.target, dest)
        if not os.path.isdir(folder):
            os.mkdir(folder)
        # Binary mode: the decoded payload is raw bytes and must hit the disk
        # unchanged so that md5digest() computes the digest of the same bytes.
        f = open(os.path.join(folder, doc_id + '.xml'), 'wb')
        try:
            f.write(b64decode(content))
        finally:
            # The original code said `f.close` (no call), leaking the handle.
            f.close()

    def deleteArticles(self, ids):
        """Deleting is not implemented for local targets."""
        pass

    def md5digest(self, folder):
        """Return a pickled dict mapping '<folder>/<file name>' to MD5 hex digest.

        Every file found below <target>/<folder> is hashed.

        NOTE(review): the keys keep the '.xml' suffix added by xmlpost(),
        whereas Publisher.publish() looks entries up as folder + '/' + url;
        confirm whether the suffix should be stripped here.
        """
        result = {}
        for root, dirs, files in os.walk(os.path.join(self.target, folder)):
            for name in files:
                f = open(os.path.join(root, name), 'rb')
                try:
                    result[folder + '/' + name] = md5.new(f.read()).hexdigest()
                finally:
                    f.close()
        return pickle.dumps(result)

def publisher_factory():
    """Build the PublishTarget selected by the module-level _TaRgEt flag."""
    if _TaRgEt != 'plone':
        return LocalTarget('/var/tmp')

    # The content of log-mdp.txt is placed in front of the '@' of the URL.
    credentials_file = open('log-mdp.txt', 'r')
    credentials = credentials_file.read()
    credentials_file.close()
    url = ''.join(['http://', credentials, '@127.0.0.1:6080/codes-et-lois/'])
    return PloneTarget(url)

# Initial number of slots of the `max_threads` semaphore below.
MAX_THREADS = 4
# Counting semaphore: Publisher.publish() acquires one slot before starting a
# PublishThread, and PublishThread.run() releases it when it is done.
max_threads = threading.Semaphore(MAX_THREADS)

# Guards the two connection lists below.
_proxy_lock = threading.Lock()
# Idle publishing targets that _get_connection() can hand out again.
_available_conn = []
# Publishing targets currently handed out by _get_connection().
_active_conn = []

def _get_connection():
    """Hand out a publishing target, reusing an idle one when available.

    The returned target is recorded in `_active_conn`; give it back with
    _release_connection(). The pool lock is released even when
    publisher_factory() raises, so a failed creation cannot block the other
    threads forever.
    """
    _proxy_lock.acquire()
    try:
        if _available_conn:
            conn = _available_conn.pop()
        else:
            conn = publisher_factory()
        _active_conn.append(conn)
    finally:
        _proxy_lock.release()
    # TODO: at production rollout, sleep here (time.sleep(MAX_THREADS)) to
    # give site users a chance to be scheduled.
    return conn

def _release_connection(conn):
    """Move `conn` from the active list back to the idle list.

    The pool lock is released even when `conn` is not in `_active_conn`
    (``list.remove`` raises ValueError in that case).
    """
    _proxy_lock.acquire()
    try:
        _available_conn.append(conn)
        _active_conn.remove(conn)
    finally:
        _proxy_lock.release()

class PublishThread(threading.Thread):
    def __init__(self, dest, doc_id,
                 title, content, type, format, state, 
                 descr, edate, subject, checksum):
        threading.Thread.__init__(self)
        self.dest = dest
        self.doc_id = doc_id
        self.title = title
        self.content = content
        self.type = type
        self.format = format
        self.state = state
        self.descr = descr
        self.edate = edate
        self.subject = subject
        self.checksum = checksum

    def run(self):
        #print '%s publishing %s/%s' % (self.getName(), self.dest, self.doc_id)
        print 'updating %s/%s' % (self.dest, self.doc_id)
        conn = _get_connection()
        try:
            False or conn.xmlpost(self.dest, self.doc_id,
                                  self.title, self.content, self.type, self.format, self.state,
                                  self.descr, self.edate, self.subject, self.checksum)
        except Exception, error:
            print 'Error while publishing resource %s/%s, %s : %s' % (self.dest, self.doc_id, self.type, error)
        _release_connection(conn)
        max_threads.release()

class Publisher:
    """Posts a document in a background thread when its checksum changed."""

    def publish(self, type, folder, url, title, content, descr, subject):
        """Publish `content` as <folder>/<url> unless its digest is unchanged.

        The digest is compared with the entry of the module-level `md5digest`
        dict; the entry is removed afterwards, whether or not anything was
        published.
        """
        checksum = md5.new(content.encode('utf-8')).hexdigest()
        dest = folder + '/' + url
        edate = datetime.now().ctime()

        up_to_date = md5digest.get(dest) == checksum
        if not up_to_date:
            encoded_title = b64encode(title.encode('utf-8'))
            encoded_body = b64encode(tidy_dummy(content).encode('utf-8'))
            encoded_subject = b64encode(subject.encode('utf-8'))
            worker = PublishThread(folder, url,
                                   encoded_title, encoded_body,
                                   type, 'text/html', 'publish',
                                   descr, edate,
                                   encoded_subject,
                                   checksum)
            # Blocks until a publishing slot is free.
            max_threads.acquire()
            worker.start()

        # remove from digest
        md5digest.pop(dest, None)

########## functions ##########

# Characters that utftourl() replaces with a hyphen.
strip_bad_char = '\':"*^ _(),\t.'

# if an url is too long, first remove these words.
# warning: '-' are mandatory for the first and last words, for future '-'.join() sake.
# (the list is joined with '-|-' below, so every alternative of the resulting
# pattern has the form '-word-')
unwanted_url_words = ['-titre', 'titre', 'livre', 'partie', 'chapitre', 'section', 'sous', 'paragraphe',
                      'reglementaire', 'legislative', 'decret', 'decrets', 'arrete', 'arretes', 'simples',
                      'annexe', 'annexes',
                      'du', 'de', 'la', 'le', 'les', 'des', 'd', 'l', 'a', 'aux', 'par', 'et',
                      'un', 'une', 'en', 'sont', 'c', 'sur',
                      'i', 'ii', 'iii', 'iv', 'v', 'vi', 'vii', 'viii', 'ix', 'x', 'xi', 'xii', 'xiii',
                      'xiv', 'xv',
                      'premiere', 'deuxieme', 'troisieme', 'quatrieme',
                      'cinquieme', 'sixieme', 'septieme', 'huitieme',
                      'bis', '1er', 'ier-',
                      ]

# Matches a single digit, or one of the '-word-' alternatives built above.
strip_unwanted_url_words = re.compile(r'\d|' + '-|-'.join(unwanted_url_words) )
# Matches a run of one or more hyphens.
strip_extra_hyphens = re.compile(r'-+')

def utftourl(text):
    """Turn `text` into a lower-case, hyphen-separated ASCII identifier.

    Each character is handled in turn:
      * ASCII characters are kept, except those listed in `strip_bad_char`,
        which become '-';
      * non-ASCII characters are replaced through `CharMapping`; characters
        missing from that table become '-' instead of aborting the whole
        conversion with a KeyError.

    Runs of hyphens are then collapsed and leading/trailing hyphens removed.

    When the result is longer than 96 characters, digits and the words of
    `unwanted_url_words` are stripped repeatedly until nothing changes; the
    outcome is cut to 63 characters and suffixed with '-' plus the first
    7 hex digits of the MD5 of the full-length identifier.
    """
    pieces = []
    for item in text:
        try:
            char = item.encode('ascii')
        except UnicodeError:
            char = CharMapping.get(item, '-')
        if char in strip_bad_char:
            char = '-'
        pieces.append(char)

    url = strip_extra_hyphens.sub('-', ''.join(pieces).lower()).strip('-')

    if len(url) > 96:
        previous = ''
        shortened = url
        # Removing a word can bring two other removable words next to each
        # other, hence the iteration up to a fixed point.
        while previous != shortened:
            previous = shortened
            shortened = strip_extra_hyphens.sub(
                '-', strip_unwanted_url_words.sub('-', previous))

        url = shortened[:63] + '-' + md5.md5(url).hexdigest()[0:7]

    return url

########## Classes ##########
class CodesEtLoisObject:
    def __init__(self, parent, title, url, descr, level):
        self.parent = parent
        self.title = title
        self.url = url
        self.level = level
        self.keywords = []
        self.children = []
        self.content = ''

    def depth(self):
        """Returns maximal depth of this branch"""
        if self.children:
            return max([child.depth() for child in self.children]) + 1
        else:
            return 0

    def toc_item(self):
        """Returns ToC entry for this object"""
        return '<div class="Niveau%d"><a href="%s">%s</a>' % (self.level, 
                                                              self.url,
                                                              self.title)

    def dump_toc(self):
        """Returns ToC for this object and its sub-objects"""
        result = self.toc_item()

        in_articles = False
        for child in self.children:
            if isinstance(child, Article):
                if not in_articles:
                  result += '<div class="Articles">\n'
                  in_articles = True
            else:
                if in_articles:
                    result += '</div>\n'
                    in_articles = False
            result += child.dump_toc()

        if in_articles:
            result += '</div>\n'

        return result + '</div>\n'

    def dump_integral(self):
        """Returns integral text"""

        result = self.toc_item()

        in_articles = False
        for child in self.children:
            if isinstance(child, Article):
                if not in_articles:
                  result += '<div class="Articles">\n'
                  in_articles = True
            else:
                if in_articles:
                    result += '</div> <!-- Articles -->\n'
                    in_articles = False
            result += child.dump_integral()

        if in_articles:
            result += '</div> <!-- Articles -->\n'

        return result + '</div>\n'

    def set_urls(self):
        for child in self.children:
            child.set_urls()

        parent = self
        url = ''
        while parent.parent:
            url = parent.title + '-' + url
            parent = parent.parent
        self.url = utftourl(url)

    def set_keywords(self):
        for child in self.children:
            child.set_keywords()
        keywords = {}

        for child in self.children:
            for k in child.keywords:
                if k in keywords:
                    keywords[k] += 1
                else:
                    keywords[k] = 0
        
        k = keywords.keys()
        k.sort(lambda a,b: keywords[b] - keywords[a])
        self.keywords = k[0:10]

    def set_prev(self, prev = None):
        for child in self.children:
            prev = child.set_prev(prev)
        return prev
        
    def set_next(self, next = None):
        for child in self.children[::-1]:
            next = child.set_next(next)
        return next

    def get_folder(self):
        """Returns destination folder"""
        parent = self
        while parent.parent:
            parent = parent.parent
        return utftourl(parent.title)

    def get_root_title(self):
        """Returns root title"""
        parent = self
        while parent.parent:
            parent = parent.parent
        return parent.title

    def newchild(self, level, title):
        if level == 'Article':
            obj = Article(self, title, self.level + 1)
            self.children.append(obj)
        elif level == 'TOC':
            obj = TOC(self, title, self.level + 1)
            self.children.append(obj)
        else:
            print "Warning: you must have done something wrong..."
            obj = None
        return obj

    def publish(self, publisher):
        pass

class Code (CodesEtLoisObject):
    """Root of the document tree (level 0, no parent)."""

    def __init__(self, title):
        CodesEtLoisObject.__init__(self, None, title, '', '', 0)

    def toc_item(self):
        """Returns ToC entry for this object"""
        # Unicode literal: a byte-string template holding non-ASCII characters
        # cannot be combined with a unicode title under the default ASCII
        # codec (UnicodeDecodeError).
        return u"""<div class="Niveau%d">
        <a href="%s">%s</a>
        <a href="texte-integral" title="[Texte Intégral]">
          <img src="book_icon.gif" alt="[Texte Intégral]"/>
        </a>
        """ % (self.level,
               self.url,
               self.title)

    def set_urls(self):
        """Give the root the fixed url 'toc', then set the children's urls."""
        self.url = 'toc'
        for child in self.children:
            child.set_urls()

    def publish(self, publisher):
        """Publish the full text and the table of contents, then every child."""
        folder = self.get_folder()
        subject = ';'.join(self.keywords)

        publisher.publish('Md5TOC', folder, 'texte-integral', self.title,
                          self.dump_integral(), self.get_root_title(), subject)

        publisher.publish('Md5TOC', folder, 'toc', self.title,
                          self.dump_toc(), self.get_root_title(), subject)

        for child in self.children:
            child.publish(publisher)

class TOC (CodesEtLoisObject):
    """Intermediate node of the tree: a table-of-contents level."""

    def __init__(self, parent, title, level):
        CodesEtLoisObject.__init__(self, parent, title, '', '', level)

    def _has_integral(self):
        """Tell whether this level gets its own '-texte-integral' page."""
        return self.level < 2 or self.depth() == 1

    def toc_item(self):
        """Returns ToC entry for this object"""
        if not self._has_integral():
            return CodesEtLoisObject.toc_item(self)
        # Unicode literal: a byte-string template holding non-ASCII characters
        # cannot be combined with a unicode title under the default ASCII
        # codec (UnicodeDecodeError).
        return u"""<div class="Niveau%d">
            <a href="%s">%s</a>
            <a href="%s-texte-integral" title="[Texte Intégral]">
             <img src="book_icon.gif" alt="[Texte Intégral]"/>
            </a>
            """ % (self.level,
                   self.url,
                   self.title,
                   self.url)

    def set_urls(self):
        """Set the children's urls, then this one.

        The url is 'toc-' followed by the titles of this object and of its
        ancestors (the root excluded), outermost first.
        """
        for child in self.children:
            child.set_urls()

        parent = self
        url = ''
        while parent.parent:
            url = parent.title + '-' + url
            parent = parent.parent
        self.url = utftourl('toc-' + url)

    def publish(self, publisher):
        """Publish this level's ToC page (and full-text page), then the children.

        Each page is wrapped in the ToC entries of all ancestors, and closed
        with one '</div>' per ancestor.
        """
        header = ''
        footer = ''
        parent = self.parent
        while parent:
            header = parent.toc_item() + header
            footer += '</div>\n'
            parent = parent.parent

        folder = self.get_folder()
        subject = ';'.join(self.keywords)
        descr = self.title + ' du ' + self.get_root_title()

        publisher.publish('Md5TOC', folder, self.url, self.title,
                          header + self.dump_toc() + footer, descr, subject)

        if self._has_integral():
            # Unicode literal for the same reason as in toc_item().
            publisher.publish('Md5TOC', folder, self.url + '-texte-integral',
                              self.title,
                              header + self.dump_integral() + footer,
                              descr + u' - Texte Intégral',
                              subject)

        for child in self.children:
            child.publish(publisher)

class Article (CodesEtLoisObject):
    """Leaf node holding one article: text, history entries and notes."""

    def __init__(self, parent, title, level):
        CodesEtLoisObject.__init__(self, parent, title, '', '', level)
        self.histo = []
        self.notes = []
        # Neighbouring articles, filled in by set_prev() / set_next().
        self.prev = None
        self.next = None

    def depth(self):
        """Returns maximal depth of this branch"""
        return 0

    def set_prev(self, prev = None):
        """Record `prev` as the previous article and return this article."""
        self.prev = prev
        return self

    def set_next(self, next = None):
        """Record `next` as the next article and return this article."""
        self.next = next
        return self

    def toc_item(self):
        """Returns ToC entry for this object"""
        return '<div class="Article"><a href="%s">%s</a>' % (self.url, self.title)

    def _render_parts(self):
        """Return the (histo, content, notes) HTML fragments of this article.

        Each history entry and note gets its own div; each line of the
        stripped content becomes one paragraph.
        """
        histo = ''.join(['<div class="ArticleHisto">%s</div>\n' % h
                         for h in self.histo])
        notes = ''.join(['<div class="ArticleNotes">%s</div>\n' % n
                         for n in self.notes])
        content = '<p>' + '</p><p>'.join(self.content.strip().split('\n')) + '</p>'
        return histo, content, notes

    def dump_integral(self):
        """Return this article as it appears inside a full-text page."""
        histo, content, notes = self._render_parts()

        result = """
              <div class="Article">
                <div class="ArticleTitle"><a href="%s">%s</a></div>
                %s
                %s
                %s
              </div>
              """ % (self.url, self.title, histo, content, notes)

        return result

    def dump_article(self):
        """Return the stand-alone page of this article.

        The article is wrapped in the ToC entries of all its ancestors and
        followed by links to the previous and next articles, when set.
        """
        header = ''
        footer = ''
        parent = self.parent
        while parent:
            header = parent.toc_item() + header
            footer += '</div>\n'
            parent = parent.parent

        histo, content, notes = self._render_parts()

        # Unicode literals below: a byte-string template holding non-ASCII
        # characters cannot be combined with unicode titles under the default
        # ASCII codec (UnicodeDecodeError).
        if self.prev:
            prev = u"""<span class="articlePrevious">
                     <a tabindex="2" href="%s"><span>Article précédent :&nbsp;</span>%s</a>
                   </span>""" % (self.prev.url, self.prev.title)
        else:
            prev = ''

        if self.next:
            next = u"""<span class="articleNext">
                     <a tabindex="2" href="%s"><span>Article suivant :&nbsp;</span>%s</a>
                   </span>""" % (self.next.url, self.next.title)
        else:
            next = ''

        result = """
              %s
              <div class="Article">
                <div class="ArticleTitle">%s</div>
                %s
                %s
                %s
                 <div class="articleFooter">
                %s%s
                 </div> 
              </div>
              %s
              """ % (header, self.title, histo, content, notes, prev, next, footer)
        return result

    def set_urls(self):
        """Derive the url from the article title alone."""
        self.url = utftourl(self.title)

    def set_keywords(self):
        """Extract the keywords from the article content."""
        self.keywords = keywords_from_content(self.content)

    def publish(self, publisher):
        """Publish the stand-alone page of this article."""
        publisher.publish('Md5Article', self.get_folder(), self.url, self.title,
                          self.dump_article(), self.title + ' du ' + self.get_root_title(), ';'.join(self.keywords) )

class ParseCode(ContentHandler):
    """SAX handler that builds the Code / TOC / Article tree from the input XML.

    After parsing, `root` holds the Code object created for the 'Code'
    element (None when no such element was seen).
    """

    # Element names that open a table-of-contents level.
    _toc_levels = ('niveau1', 'niveau2', 'niveau3', 'niveau4',
                   'niveau5', 'niveau6', 'niveau7', 'niveau8')

    def __init__(self, base_url = ''):
        # `base_url` is accepted but not used.
        ContentHandler.__init__(self)
        self.root = None
        self.currentObject = None
        # Flags telling which buffer receives character data.
        self.inHistoContent = False
        self.inNoteContent = False
        self.link = False
        self.inParContent = False
        self.title = ''
        # Accumulation buffers.
        self.text = ''
        self.notes = ''
        self.histo = ''
        self.link_text = ''
        self.link_url = None

    def dump(self):
        """Return the table of contents of the parsed tree.

        The tree classes define no ``dump`` method (the previous
        ``self.root.dump()`` could only raise AttributeError), so this
        delegates to ``dump_toc``.
        """
        return self.root.dump_toc()

    def dump_integral(self):
        """Return the full text of the parsed tree."""
        return self.root.dump_integral()

    def startElement(self, name, attrs):
        """
        Called at the start of a tag: create the matching tree node or
        switch on the matching capture flag.
        """
        # If it's in [ignore], ignore it
        if name in ignore:
            return

        self.title = attrs.get('title')
        if self.title:
            self.title = self.title.strip().strip('.')

        if name == 'Code':
            self.root = Code(self.title)
            self.currentObject = self.root

        elif name in self._toc_levels:
            self.currentObject = self.currentObject.newchild('TOC', self.title)

        elif name == 'article':
            self.currentObject = self.currentObject.newchild('Article', self.title)
            self.inParContent = True

        elif name == 'Histo':
            self.inHistoContent = True

        elif name == 'Note':
            self.inNoteContent = True

        elif name == 'a':
            self.link = True
            self.link_text = ''
            self.link_url = attrs.get('href')

    def characters(self, elt_content):
        """
        Called when the parser meets character data inside an element.
        The data is appended to the first active buffer, in this order:
        link text, history, notes, article text. It is dropped when no
        capture flag is set.
        """
        if not elt_content:
            return

        if self.link:
            self.link_text += elt_content
        elif self.inHistoContent:
            self.histo += elt_content
        elif self.inNoteContent:
            self.notes += elt_content
        elif self.inParContent:
            self.text += elt_content

    def endElement(self, name):
        """
        Called at the end of a tag: store the accumulated buffer on the
        current node and move back up to its parent where relevant.
        """
        if name in ignore:
            return

        if name == 'Code':
            pass

        elif name in self._toc_levels:
            self.currentObject.content = self.text
            self.currentObject = self.currentObject.parent

        elif name == 'article':
            self.currentObject.content = self.text
            self.text = ''
            self.inParContent = False
            self.currentObject = self.currentObject.parent

        elif name == 'Histo':
            self.inHistoContent = False
            self.currentObject.histo.append(self.histo)
            self.histo = ''

        elif name == 'Note':
            self.inNoteContent = False
            self.currentObject.notes.append(self.notes)
            self.notes = ''

        elif name == 'a':
            self.link = False
            # The link is always re-emitted into the article text buffer.
            self.text += '<a href="%s">%s</a> ' % (self.link_url, self.link_text)

 
########## global variables ##########
# Element names that ParseCode.startElement() / endElement() skip entirely.
ignore = ['Articles']

# ASCII replacements for non-ASCII characters; consulted by utftourl() when a
# character cannot be encoded as ASCII.
CharMapping = {
    u'\u0152': 'oe',
    u'\u0153': 'oe',
    u'\u2015': '-',
    u'\xb0': 'o',
    u'\xc0': 'a',
    u'\xc2': 'a',
    u'\xc7': 'c',
    u'\xc8': 'e',
    u'\xc9': 'e',
    u'\xca': 'e',
    u'\xce': 'i',
    u'\xd4': 'o',
    u'\xe0': 'a',
    u'\xe2': 'a',
    u'\xe7': 'c',
    u'\xe8': 'e',
    u'\xe9': 'e',
    u'\xea': 'e',
    u'\xee': 'i',
    u'\xef': 'i',
    u'\xf4': 'o',
    u'\xf9': 'u',
    u'\xfb': 'u',
    }

########## MAIN ##########
# Open the file given as first command-line argument.

file = open(sys.argv[1], "rb")

# Parse the XML produced by the download step.
parser = make_parser()

# Tell the parser we are not interested in XML namespaces
parser.setFeature(feature_namespaces, 0)

# Create the handler
dh = ParseCode()

# Tell the parser to use our handler
parser.setContentHandler(dh)

# Parse the input
parser.parse(file)

# NOTE(review): this takes MAX_THREADS - 1 semaphore slots before any
# publishing has started and never gives them back, which leaves a single
# slot for the PublishThread instances. Confirm whether it was meant to run
# after publish() below, as a wait-for-completion barrier.
for i in range(1, MAX_THREADS):
    max_threads.acquire()

file.close()


# Post-process the parsed tree: urls, keywords and previous/next links.
dh.root.set_urls()
dh.root.set_keywords()
dh.root.set_prev()
dh.root.set_next()

plone = publisher_factory()
# `md5digest` is the module-level dict read and pruned by Publisher.publish().
# NOTE(review): pickle.loads() on data returned by the target is only safe
# when that target is trusted.
md5digest = pickle.loads(plone.md5digest(dh.root.get_folder()))

dh.root.publish( Publisher() )

# delete spurious documents
# (the entries left in md5digest are those publish() did not remove)
plone.deleteArticles( md5digest.keys() )



