# -*- coding: utf-8 -*-

# Publishing backend selector read by publisher_factory(): the value 'plone'
# selects a PloneTarget, any other value selects a LocalTarget.
_TaRgEt = 'local'# or 'plone'

# URL templates of the Plone servers. publisher_factory() picks one at random
# and fills the %s placeholder with the content of the file 'log-mdp.txt'.
plone_targets = ['http://%s@127.0.0.1:6080/codes-et-lois/',
                 'http://%s@10.0.27.3:6080/codes-et-lois/']

import xmlrpclib, re, sys, md5
from xml.sax import saxutils
from xml.sax import make_parser
from xml.sax.handler import feature_namespaces, ContentHandler
from datetime import datetime, timedelta
from base64 import b64encode, b64decode
import pickle
import threading, time, os, popen2
from random import choice
from sqlobject import *

from utftourl import utftourl
from keywords import keywords_from_content
from concordance_travail import concordance_travail, concordance_travail_reverse

########## database stuff ##########

# All SQLObject tables declared below use this in-memory SQLite connection.
sqlhub.processConnection = connectionForURI('sqlite:/:memory:')

# Table with one row per parsed article; ParseCode.startElement inserts a row
# for each <article> element it meets.
class CodeTable(SQLObject):
    # URL of the article, built as '/<code url>/<article url>'.
    Url = StringCol()

# Create the table right away in the in-memory database.
CodeTable.createTable()

# Table with one row per hyperlink found in the parsed text;
# ParseCode.startElement inserts a row for each <a> element it meets.
class Citation(SQLObject):
    # URL of the object that contains the link.
    fromUrl = StringCol()
    # Value of the link's href attribute.
    toUrl = StringCol()

# Create the table right away in the in-memory database.
Citation.createTable()

########## publishing stuff ##########

def tidy(content):
    """Pretty-print an XML document with the external ``tidy`` program.

    content: the XML text fed to ``tidy`` on its standard input.

    Returns the standard output of ``tidy`` decoded from UTF-8.

    The previous implementation wrote the whole input before reading any
    output, which can block forever once the output pipe fills up, and it
    never waited for the child process. ``communicate()`` feeds the input
    and drains the output concurrently, then reaps the child.
    """
    # Local import: the file-level import block does not provide subprocess.
    import subprocess

    proc = subprocess.Popen(
        "tidy -xml -utf8 -i -m -w 75 --drop-empty-paras y -q",
        shell=True,          # the command is a shell string, as before
        close_fds=True,      # the child only needs its standard streams
        stdin=subprocess.PIPE,
        stdout=subprocess.PIPE)
    output, _unused_stderr = proc.communicate(content)
    return output.decode('utf-8')

def tidy_dummy(content):
    """No-op replacement for tidy(): hand *content* back untouched."""
    untouched = content
    return untouched

class SequenceableProxy(xmlrpclib.ServerProxy):
    """
    ServerProxy that compares by object identity.

    Needed because list.remove() compares elements, and comparing plain
    proxies keeps on issuing xml-rpc calls.
    """
    def __eq__(self, other):
        # Two proxies are equal only when they are the very same object.
        return self is other

class PublishTarget:
    """Base class of the publishing back ends; every operation is a no-op."""

    def __init__(self, target):
        # Keep the target with every newline character removed.
        self.target = ''.join(target.split('\n'))

    def xmlpost(self, dest, doc_id, title, content, type, format, state,
                descr, edate, subject, checksum):
        """Publish one document; does nothing in the base class."""
        return None

    def deleteArticles(self, ids):
        """Delete the documents listed in *ids*; does nothing in the base class."""
        return None

    def md5digest(self, folder):
        """Return the checksums stored for *folder*; None in the base class."""
        return None

class PloneTarget(PublishTarget):
    """Publish target that forwards every operation to an XML-RPC server."""
    def __init__(self, target):
        """Open a proxy on *target*, the URL of the XML-RPC server."""
        PublishTarget.__init__(self, target)
        # The proxy is built from self.target, i.e. the URL as stored by
        # PublishTarget.__init__.
        self.plone = SequenceableProxy(self.target)
        
    def xmlpost(self, dest, doc_id, title, content, type, format, state,
                descr, edate, subject, checksum):
        """Forward the document to the remote xmlpost method; returns None."""

        self.plone.xmlpost(dest, doc_id, title, content, type, format, state,
                           descr, edate, subject, checksum)

    def deleteArticles(self, ids):
        """Forward *ids* to the remote deleteArticles method; returns None."""
        self.plone.deleteArticles(ids)

    def md5digest(self, folder):
        """Return whatever the remote md5digest method returns for *folder*."""
        return self.plone.md5digest(folder)

class LocalTarget(PublishTarget):
    """Publish target that writes the documents below a local directory.

    ``self.target`` (set by PublishTarget.__init__) is the base directory.
    """

    def xmlpost(self, dest, doc_id, title, content, type, format, state,
                descr, edate, subject, checksum):
        """Write one document to ``<target>/<dest>/<doc_id>.xml``.

        content: base64-encoded document body; it is decoded before writing.
        The remaining parameters are accepted for interface compatibility
        and are not used here.

        Raises OSError if the destination directory cannot be created and
        IOError if the file cannot be written.
        """
        folder_path = os.path.join(self.target, dest)
        if not os.path.isdir(folder_path):
            try:
                os.makedirs(folder_path)
            except OSError:
                # xmlpost runs in several PublishThread instances at once:
                # another thread may have created the directory between the
                # check and the creation. Only a real failure is re-raised.
                if not os.path.isdir(folder_path):
                    raise

        # Binary mode: the decoded content is written byte for byte.
        f = open(os.path.join(folder_path, doc_id + '.xml'), 'wb')
        try:
            f.write(b64decode(content))
        finally:
            f.close()

    def deleteArticles(self, ids):
        """Nothing is deleted for the local target."""
        pass

    def md5digest(self, folder):
        """Return a pickled dict of MD5 hex digests of the files of *folder*.

        Keys have the form ``folder + '/' + file name``; every file found
        below ``<target>/<folder>`` (sub-directories included) is hashed.
        """
        # NOTE(review): the keys keep the '.xml' suffix added by xmlpost,
        # while Publisher.publish looks entries up as folder + '/' + url
        # without a suffix, so local digests look like they never match.
        # Confirm whether that is intended.
        result = {}
        for root, dirs, files in os.walk(os.path.join(self.target, folder)):
            for name in files:
                f = open(os.path.join(root, name), 'rb')
                try:
                    result[folder + '/' + name] = md5.new(f.read()).hexdigest()
                finally:
                    f.close()
        return pickle.dumps(result)

def publisher_factory():
    """Build the PublishTarget selected by the module-level _TaRgEt value."""
    # Anything but 'plone' publishes to the local file system.
    if _TaRgEt != 'plone':
        return LocalTarget('/var/tmp')

    # The content of log-mdp.txt fills the %s placeholder of the chosen URL.
    credentials_file = open('log-mdp.txt', 'r')
    credentials = credentials_file.read()
    credentials_file.close()
    url_template = choice(plone_targets)
    return PloneTarget(url_template % credentials)

# Upper bound on the number of PublishThread instances running at once.
MAX_THREADS = 6
# Acquired by Publisher.publish before a thread is started and released at the
# end of PublishThread.run.
max_threads = threading.Semaphore(MAX_THREADS)

# Guards the two connection lists below.
_proxy_lock = threading.Lock()
# Publish targets that are idle and can be reused by _get_connection().
_available_conn = []
# Publish targets currently handed out by _get_connection().
_active_conn = []

def _get_connection():
    """Hand out a publish target, reusing an idle one when available.

    The returned object is recorded in ``_active_conn`` and must be given
    back with ``_release_connection()``.

    Any exception raised by ``publisher_factory()`` propagates to the caller;
    the pool lock is released in every case so that other threads are not
    blocked forever.
    """
    _proxy_lock.acquire()
    try:
        if _available_conn:
            conn = _available_conn.pop()
        else:
            conn = publisher_factory()
        _active_conn.append(conn)
    finally:
        _proxy_lock.release()

    # give site users a chance to be scheduled
    # TODO: to be enabled when going to production...
    #time.sleep(MAX_THREADS)
    return conn

def _release_connection(conn):
    """Give *conn* back to the pool of idle publish targets.

    conn: an object previously returned by ``_get_connection()``.

    Raises ValueError if *conn* is not currently recorded as active; the pool
    lock is released even in that case.
    """
    _proxy_lock.acquire()
    try:
        _available_conn.append(conn)
        _active_conn.remove(conn)
    finally:
        _proxy_lock.release()

# Guards every access to update_statistics.
update_statistics_lock = threading.Lock()
# Running totals updated by PublishThread.run after each successful upload.
# 'total publish time' is initialised here but not updated anywhere in this
# file.
update_statistics = { 'uploaded articles': 0,
                      'uploaded bytes': 0,
                      'total upload time': timedelta(),
                      'total publish time': timedelta()
                      }
                      
def print_stats(dest):
    """Print one progress line with the current upload statistics.

    dest: label of the document just handled; its last 60 characters are
    printed.

    The statistics lock is released even when formatting or printing fails,
    so a failing report cannot block the other publishing threads.
    """
    update_statistics_lock.acquire()
    try:
        _dur = update_statistics['total upload time']
        # Cumulated upload time in milliseconds; the extra 0.1 keeps the
        # divisions below away from zero.
        duration = (_dur.days * 3600 * 24 + _dur.seconds) * 1000.0 + _dur.microseconds / 1000.0 + 0.1

        print('%i\t%60s\t%.1f articles/s\t%.1f KB/s' % (
            update_statistics['uploaded articles'],
            dest[-60:],
            update_statistics['uploaded articles'] * 1000.0 / duration,
            (update_statistics['uploaded bytes'] * 1000.0 / 1024.0) / duration))
    finally:
        update_statistics_lock.release()

class PublishThread(threading.Thread):
    """Thread that uploads one document through a pooled publish target.

    The creator is expected to acquire ``max_threads`` before starting the
    thread; ``run()`` releases it when the upload is over.
    """

    def __init__(self, dest, doc_id,
                 title, content, type, format, state, 
                 descr, edate, subject, checksum):
        """Store the arguments later passed unchanged to ``xmlpost``."""
        threading.Thread.__init__(self)
        self.dest = dest
        self.doc_id = doc_id
        self.title = title
        self.content = content
        self.type = type
        self.format = format
        self.state = state
        self.descr = descr
        self.edate = edate
        self.subject = subject
        self.checksum = checksum

    def run(self):
        """Upload the document, then print the updated statistics.

        The connection is returned to the pool and the ``max_threads``
        semaphore is released whatever happens. Otherwise an error escaping
        the upload would leave the main thread waiting forever for a slot
        that is never freed.
        """
        try:
            conn = _get_connection()
            try:
                self._post_document(conn)
            finally:
                _release_connection(conn)
        finally:
            max_threads.release()
        print_stats('%s/%s' % (self.dest, self.doc_id))

    def _post_document(self, conn):
        """Send the document through *conn* and account for it in the statistics.

        Errors derived from Exception are reported on standard output and
        swallowed, so a failed upload does not kill the thread.
        """
        try:
            start = datetime.now()
            conn.xmlpost(self.dest, self.doc_id,
                         self.title, self.content, self.type, self.format, self.state,
                         self.descr, self.edate, self.subject, self.checksum)
            end = datetime.now()

            update_statistics_lock.acquire()
            try:
                update_statistics['uploaded articles'] += 1
                update_statistics['uploaded bytes'] += len(self.title + self.content + self.descr)
                update_statistics['total upload time'] += end - start
            finally:
                # Never leave the statistics lock held, even on error.
                update_statistics_lock.release()
        except Exception, error:
            print('Error while publishing resource %s/%s, %s : %s' % (self.dest, self.doc_id, self.type, error))

class Publisher:
    """Starts a PublishThread for every document whose checksum changed."""

    def publish(self, type, folder, url, title, content, descr, subject):
        """Upload the document unless the global md5digest already has its checksum.

        The entry of the document is removed from md5digest in every case, so
        that what remains in md5digest afterwards was not published by this run.
        """
        checksum = md5.new(content.encode('utf-8')).hexdigest()
        dest = folder + '/' + url
        edate = datetime.now().ctime()

        up_to_date = dest in md5digest and md5digest[dest] == checksum
        if not up_to_date:
            worker = PublishThread(folder, url,
                                   b64encode(title.encode('utf-8')),
                                   b64encode(tidy_dummy(content).encode('utf-8')),
                                   type, 'text/html', 'publish',
                                   b64encode(descr.encode('utf-8')), edate,
                                   b64encode(subject.encode('utf-8')),
                                   checksum)
            # Wait for a free slot; the thread frees it when it is done.
            max_threads.acquire()
            worker.start()

        # remove from digest
        md5digest.pop(dest, None)

########## Classes ##########
class CodesEtLoisObject:
    """Generic node of the document tree.

    A node holds its parent, its children, a title, a URL, a nesting level,
    a keyword list and the texts attached to it (content, histo, notes).
    """

    def __init__(self, parent, title, url, descr, level):
        """Create a childless node; ``descr`` is accepted but not stored."""
        self.parent = parent
        self.children = []
        self.title = title
        self.url = url
        self.level = level
        self.content = ''
        self.keywords = []
        self.histo = []
        self.notes = []

    def depth(self):
        """Returns maximal depth of this branch"""
        if not self.children:
            return 0
        return 1 + max([child.depth() for child in self.children])

    def toc_item(self):
        """Returns ToC entry for this object"""
        fields = (self.level, self.url, self.title)
        return '<div class="Niveau%d"><a href="%s">%s</a>' % fields

    def _dump_tree(self, dump_child, close_articles):
        """Render this node followed by its children.

        Consecutive Article children are wrapped in an "Articles" div that is
        closed with *close_articles*; *dump_child* renders one child.
        """
        output = self.toc_item()
        articles_open = False
        for child in self.children:
            child_is_article = isinstance(child, Article)
            if child_is_article and not articles_open:
                output += '<div class="Articles">\n'
                articles_open = True
            elif articles_open and not child_is_article:
                output += close_articles
                articles_open = False
            output += dump_child(child)

        if articles_open:
            output += close_articles
        return output + '</div>\n'

    def dump_toc(self):
        """Returns ToC for this object and its sub-objects"""
        return self._dump_tree(lambda child: child.dump_toc(),
                               '</div>\n')

    def dump_integral(self):
        """Returns integral text"""
        return self._dump_tree(lambda child: child.dump_integral(),
                               '</div> <!-- Articles -->\n')

    def set_urls(self):
        """Set the URLs of the sub-objects, then the URL of this object."""
        for child in self.children:
            child.set_urls()

        # Join the titles from the top-most non-root ancestor down to self.
        node = self
        path = ''
        while node.parent:
            path = node.title + '-' + path
            node = node.parent
        self.url = utftourl(path)

    def set_keywords(self):
        """Keep at most ten keywords, those shared by the most children first."""
        for child in self.children:
            child.set_keywords()

        # A keyword starts at 0 and gains one for every further occurrence.
        counts = {}
        for child in self.children:
            for word in child.keywords:
                if word not in counts:
                    counts[word] = 0
                else:
                    counts[word] += 1

        ranked = sorted(counts, key=lambda word: -counts[word])
        self.keywords = ranked[:10]

    def set_prev(self, prev = None):
        """Thread *prev* through the children in order; return the last value."""
        for child in self.children:
            prev = child.set_prev(prev)
        return prev

    def set_next(self, next = None):
        """Thread *next* through the children in reverse; return the last value."""
        for child in reversed(self.children):
            next = child.set_next(next)
        return next

    def preprocess(self):
        """Run preprocess() on every child."""
        for child in self.children:
            child.preprocess()

    def _root(self):
        """Return the top-most ancestor of this node (the node itself if none)."""
        node = self
        while node.parent:
            node = node.parent
        return node

    def get_folder(self):
        """Returns destination folder"""
        return utftourl(self._root().title)

    def get_root_title(self):
        """Returns root title"""
        return self._root().title

    def newchild(self, level, title):
        """Create, attach and return a child of the kind named by *level*.

        An unknown kind prints a warning and returns None.
        """
        if level == 'Article':
            child_class = Article
        elif level == 'ArticleCodeDuTravail':
            child_class = ArticleCodeDuTravail
        elif level == 'TOC':
            child_class = TOC
        else:
            print("Warning: you must have done something wrong...")
            return None

        obj = child_class(self, title, self.level + 1)
        self.children.append(obj)
        return obj

    def publish(self, publisher):
        """Publish this object; nothing to do for a generic node."""
        return None

class Code (CodesEtLoisObject):
    """Root node of the tree: it has no parent and sits at level 0."""

    def __init__(self, title):
        CodesEtLoisObject.__init__(self, None, title, '', '', 0)

    def toc_item(self):
        """Returns ToC entry for this object"""
        fields = (self.level, self.url, self.title)
        return """<div class="Niveau%d">
        <a href="%s">%s</a>
        <a href="texte-integral" title="[Texte Intégral]">
          <img src="/book_icon.gif" alt="[Texte Intégral]"/>
        </a>
        """ % fields

    def set_urls(self):
        """The root URL is always 'toc'; the children compute their own."""
        self.url = 'toc'
        for child in self.children:
            child.set_urls()

    def publish(self, publisher):
        """Publish the integral text, then the table of contents, then the children."""
        pages = (('texte-integral', self.dump_integral),
                 ('toc', self.dump_toc))
        for page_url, render in pages:
            publisher.publish('Md5TOC', self.get_folder(), page_url, self.title,
                              render(), self.get_root_title(), ';'.join(self.keywords) )

        for child in self.children:
            child.publish(publisher)

class TOC (CodesEtLoisObject):
    """Intermediate node (a heading) of the table of contents."""

    def __init__(self, parent, title, level):
        CodesEtLoisObject.__init__(self, parent, title, '', '', level)

    def toc_item(self):
        """Returns ToC entry for this object"""
        # Only nodes of level < 2 or of depth 1 get the integral-text link.
        if not (self.level < 2 or self.depth() == 1):
            return CodesEtLoisObject.toc_item(self)

        return """<div class="Niveau%d">
            <a href="%s">%s</a>
            <a href="%s-texte-integral" title="[Texte Intégral]">
             <img src="/book_icon.gif" alt="[Texte Intégral]"/>
            </a>
            """ % (self.level, self.url, self.title, self.url)

    def set_urls(self):
        """Set the URLs of the sub-objects, then this one, prefixed with 'toc-'."""
        for child in self.children:
            child.set_urls()

        node = self
        path = ''
        while node.parent:
            path = node.title + '-' + path
            node = node.parent
        self.url = utftourl('toc-' + path)

    def publish(self, publisher):
        """Publish the ToC page (and, when linked, the integral text), then the children."""
        # The page is wrapped in the ToC entries of all ancestors: one
        # opening entry and one closing div per ancestor.
        header = ''
        footer = ''
        ancestor = self.parent
        while ancestor:
            header = ancestor.toc_item() + header
            footer += '</div>\n'
            ancestor = ancestor.parent

        toc_page = header + self.dump_toc() + footer
        publisher.publish('Md5TOC', self.get_folder(), self.url, self.title,
                          toc_page, self.title + ' du ' + self.get_root_title(), ';'.join(self.keywords) )

        if self.level < 2 or self.depth() == 1:
            integral_page = header + self.dump_integral() + footer
            publisher.publish('Md5TOC', self.get_folder(), self.url + '-texte-integral', self.title,
                              integral_page, self.title + ' du ' + self.get_root_title() + ' - Texte Intégral',
                              ';'.join(self.keywords) )

        for child in self.children:
            child.publish(publisher)

class Article (CodesEtLoisObject):
    """Leaf node holding one article, linked to the previous and next ones."""

    def __init__(self, parent, title, level):
        CodesEtLoisObject.__init__(self, parent, title, '', '', level)
        self.next = None
        self.prev = None

    def depth(self):
        """Returns maximal depth of this branch"""
        return 0

    def set_prev(self, prev = None):
        """Record *prev* as the previous article; return this article."""
        self.prev = prev
        return self

    def set_next(self, next = None):
        """Record *next* as the next article; return this article."""
        self.next = next
        return self

    def toc_item(self):
        """Returns ToC entry for this object"""
        label = self.title.replace(' ', '&nbsp;')
        return '<div class="Article"><a href="%s">%s</a>' % (self.url, label)

    def _histo_html(self):
        """Return one "ArticleHisto" div per history entry."""
        return ''.join(['<div class="ArticleHisto">%s</div>\n' % entry
                        for entry in self.histo])

    def _notes_html(self):
        """Return one "ArticleNotes" div per note."""
        return ''.join(['<div class="ArticleNotes">%s</div>\n' % note
                        for note in self.notes])

    def _content_html(self):
        """Return the content with every line wrapped in its own paragraph."""
        paragraphs = self.content.strip().split('\n')
        return '<p>' + '</p><p>'.join(paragraphs) + '</p>'

    def dump_integral(self):
        """Returns the article as it appears inside an integral text."""
        histo = self._histo_html()
        notes = self._notes_html()
        content = self._content_html()

        return """
              <div class="Article">
                <div class="ArticleTitle"><a href="%s">%s</a></div>
                %s
                %s
                %s
              </div>
              """ % (self.url, self.title, histo, content, notes)

    def dump_article(self):
        """Returns the stand-alone page of the article.

        The article is wrapped in the ToC entries of all its ancestors and
        followed by links to the previous and next articles, when set.
        """
        header = ''
        footer = ''
        ancestor = self.parent
        while ancestor:
            header = ancestor.toc_item() + header
            footer += '</div>\n'
            ancestor = ancestor.parent

        histo = self._histo_html()
        notes = self._notes_html()

        prev_link = ''
        if self.prev:
            prev_link = """<span class="articlePrevious">
                     <a tabindex="2" href="%s"><span>Article précédent :&nbsp;</span>%s</a>
                   </span>""" % (self.prev.url, self.prev.title)

        next_link = ''
        if self.next:
            next_link = """<span class="articleNext">
                     <a tabindex="2" href="%s"><span>Article suivant :&nbsp;</span>%s</a>
                   </span>""" % (self.next.url, self.next.title)

        content = self._content_html()

        return """
              %s
              <div class="Article">
                <div class="ArticleTitle">%s</div>
                %s
                %s
                %s
                 <div class="articleFooter">
                %s%s
                 </div> 
              </div>
              %s
              """ % (header, self.title, histo, content, notes, prev_link, next_link, footer)

    def set_urls(self):
        """The URL of an article is derived from its title alone."""
        self.url = utftourl(self.title)

    def set_keywords(self):
        """Extract the keywords from the article content."""
        self.keywords = keywords_from_content(self.content)

    def publish(self, publisher):
        """Publish the stand-alone page of the article."""
        page = self.dump_article
        publisher.publish('Md5Article', self.get_folder(), self.url, self.title,
                          page(),
                          self.get_root_title() + ' : ' + self.title,
                          ';'.join(self.keywords) )

class ArticleCodeDuTravail (Article):
    """Article that gets a note pointing at its counterpart in the other labour code."""

    def preprocess(self):
        """Append a concordance note when the article number is in one of the tables."""
        number = self.title.replace('Article ', '').strip()

        if number in concordance_travail:
            old_ref = concordance_travail[number]
            # Only the part before the first comma is used for the link.
            old_url = utftourl('article-' + old_ref.decode('utf-8').split(',', 1)[0])
            note = "Remplace l'article <a href=\"/code-du-travail-ancien/%s\">%s</a>, de l'ancien code du travail." % (old_url, old_ref)
            self.notes.append(note)
        elif number in concordance_travail_reverse:
            new_ref = concordance_travail_reverse[number]
            new_url = utftourl('article-' + new_ref.decode('utf-8'))
            note = "Remplacé par l'article <a href=\"/code-du-travail/%s\">%s</a> dans le nouveau code du travail." % (new_url, new_ref)
            self.notes.append(note)

        Article.preprocess(self)

class ParseCode(ContentHandler):
    """SAX content handler that builds the document tree from the XML input.

    While parsing, it also inserts one CodeTable row per <article> element
    and one Citation row per <a> element.
    """
    def __init__(self, base_url = ''):
        # NOTE(review): base_url is accepted but never used in this class.
        self.root = None             # Code object created on the <Code> element
        self.currentObject = None    # node that receives children and texts
        self.inHistoContent = False  # True between <Histo> and </Histo>
        self.inNoteContent = False   # True between <Note> and </Note>
        self.link = False            # True between <a> and </a>
        self.inParContent = False    # True between <article> and </article>
        self.title = ''              # 'title' attribute of the last element seen
        self.text = ''               # buffer for article text
        self.notes = ''              # buffer for the current note
        self.histo = ''              # buffer for the current history entry

    def dump(self):
        # NOTE(review): no dump() method is defined on the node classes
        # visible in this file; this call looks like it would fail. Confirm.
        self.root.dump()

    def dump_integral(self):
        """Return the integral text of the whole tree."""
        return self.root.dump_integral()

    def startElement(self, name, attrs):
        """
        Called at the start of a tag.
        """
        # If it's in [ignore], ignore it
        if name in ignore:
            return

        # The title is stripped of surrounding whitespace and dots.
        self.title = attrs.get('title')
        if self.title: self.title = self.title.strip().strip('.')

        if name == 'Code':
            self.root = Code(self.title)
            self.currentObject = self.root

        elif name in ['niveau1', 'niveau2', 'niveau3', 'niveau4',
                      'niveau5', 'niveau6', 'niveau7', 'niveau8']:
            # Every heading level becomes a TOC node below the current node.
            self.currentObject = self.currentObject.newchild('TOC', self.title)

        elif name == 'article':
            # The two labour codes use a dedicated article class.
            if self.root.title not in ['Code du travail', 'Code du travail (ancien)'] :
                self.currentObject = self.currentObject.newchild('Article', self.title)
            else:
                self.currentObject = self.currentObject.newchild('ArticleCodeDuTravail', self.title)
            self.inParContent = True
            # record *every* article in the database
            codeUrl = '/' + utftourl(self.root.title) + '/' + utftourl(self.currentObject.title)
            CodeTable(Url=codeUrl)

        elif name == 'Histo':
            self.inHistoContent = True

        elif name == 'Note':
            self.inNoteContent = True

        elif name == 'a':
            self.link = True
            self.link_text = ''
            self.link_url = attrs.get('href')
            # record the reference in the database
            ownUrl =  '/' + utftourl(self.root.title) + '/' + utftourl(self.currentObject.title)
            Citation(fromUrl=ownUrl,
                     toUrl=self.link_url)
            #print 'from: ' + utftourl(self.currentObject.title) + ' to: ' + self.link_url

    def characters(self, elt_content):
        """
        Called when the parser meets character data inside an element.
        The data is appended to the first active buffer, in this order:
        link text, history entry, note, article text.
        """
        if elt_content != '':
            if self.link == True:
                self.link_text += elt_content

            elif self.inHistoContent == True:
                self.histo += elt_content

            elif self.inNoteContent == True:
                self.notes += elt_content

            elif self.inParContent == True:
                self.text += elt_content

    def skippedEntity(self, name):
        """Append the skipped entity, written as '&name;', to the active buffer."""
        if self.link == True:
            self.link_text += "&%s;" % name

        elif self.inHistoContent == True:
            self.histo += "&%s;" % name

        elif self.inNoteContent == True:
            self.notes += "&%s;" % name

        elif self.inParContent == True:
            self.text += "&%s;" % name

    def endElement(self, name):
        """
        Called at the end of a tag.
        The buffer of the element being closed is stored on the current node.
        """
        if name in ignore:
            return

        if name == 'Code':
            pass

        elif name in ['niveau1', 'niveau2', 'niveau3', 'niveau4',
                      'niveau5', 'niveau6', 'niveau7', 'niveau8']:
            # The text buffer is stored but, unlike for articles, not reset.
            self.currentObject.content = self.text
            self.currentObject = self.currentObject.parent

        elif name == 'article':
            self.currentObject.content = self.text
            self.text = ''
            self.inParContent = False
            self.currentObject = self.currentObject.parent
            
        elif name == 'Histo':
            self.inHistoContent = False
            self.currentObject.histo.append(self.histo)
            self.histo = ''

        elif name == 'Note':
            self.inNoteContent = False
            self.currentObject.notes.append(self.notes)
            self.notes = ''

        elif name == 'a':
            # The link is re-emitted as HTML into the article text buffer.
            self.link = False
            self.text += '<a href="%s">%s</a> ' % (self.link_url, self.link_text)


 
########## global variables ##########
# Element names that ParseCode.startElement and ParseCode.endElement skip.
ignore = ['Articles']

########## dump DB ##########
def dump_dbs():
    """Print every Citation row, then every CodeTable row.

    Rows are fetched by id, from 1 up to the number of rows of the table.
    """
    # references first, articles second
    for table in (Citation, CodeTable):
        row_count = table.select().count()
        for row_id in range(1, row_count + 1):
            print(table.get(row_id))

def dump_info_dbs():
    """Print how many articles and how many references were recorded."""
    citation_count = Citation.select().count()
    article_count = CodeTable.select().count()
    summary = '%d articles et %d références' % (article_count, citation_count)
    print(summary)

########## MAIN ##########
# Script body: parse the XML file named on the command line, build the tree,
# publish what changed and delete what is left over.

# open the file given as first argument
file = open(sys.argv[1], "rb")

# parse the XML produced by download
parser = make_parser()

# Tell the parser we are not interested in XML namespaces
parser.setFeature(feature_namespaces, 0)

# Create the handler
dh = ParseCode()

# Tell the parser to use our handler
parser.setContentHandler(dh)

# Parse the input
parser.parse(file)

file.close()


# Post-processing passes over the whole tree, run in this order.
dh.root.set_urls()
dh.root.set_keywords()
dh.root.set_prev()
dh.root.set_next()
dh.root.preprocess()

# md5digest is a module-level global read and emptied by Publisher.publish.
# NOTE(review): pickle.loads is applied to data returned by the publish
# target; unpickling is unsafe if that data cannot be trusted.
plone = publisher_factory()
md5digest = pickle.loads(plone.md5digest(dh.root.get_folder()))

start = datetime.now()

dh.root.publish( Publisher() )

# Wait for the running PublishThread instances: each one releases the
# semaphore when it ends, so all MAX_THREADS slots can only be taken here
# once no thread is left.
for i in range(0, MAX_THREADS):
    max_threads.acquire()

end = datetime.now()


# Wall-clock duration in milliseconds; the extra 0.1 keeps the divisions
# below away from zero.
_dur = end - start
duration = (_dur.days * 3600 * 24 + _dur.seconds) * 1000.0 + _dur.microseconds / 1000.0 + 0.1

print 'Total publish throughput : %f article/s\t %f KB/s\ttime : %.1fs' % (update_statistics['uploaded articles'] * 1000.0 / \
                                                                            duration,
                                                                        update_statistics['uploaded bytes'] * 1000.0 / 1024.0 / \
                                                                            duration,
                                                                        duration / 1000.0)

#dump_dbs()
dump_info_dbs()

# delete spurious documents
# (the keys still in md5digest were not published during this run)
plone.deleteArticles( md5digest.keys() )


