import urllib, re, sys
from HTMLParser import HTMLParser
from xml.sax.saxutils import quoteattr
import threading

# Command-line arguments: LEGICODE DATE FILECODE
#   legicode -- value sent as the "cidTexte" query parameter
#   date     -- value sent as the "dateTexte" query parameter
#   filecode -- path of the XML file written at the end of the script
# Missing arguments raise IndexError here.
legicode = sys.argv[1]
date = sys.argv[2]
filecode = sys.argv[3]

# NOTE(review): not referenced anywhere else in the visible source.
article_fetchers = {}

# Address of the table-of-contents page parsed by CodeParser. The arguments
# are concatenated verbatim (no URL-encoding is applied).
url = "http://www.legifrance.gouv.fr/affichCode.do?cidTexte=" + legicode + "&dateTexte=" + date
# HTML tags that CodeParser does not want to look at.
ignore = [
    'script',
    'noscript',
    'br',
    'img',
    'li',
    'ul',
]

# Same tags, kept as a separate list for ArticlesParser.
ignore_v2 = list(ignore)

# "class" attribute values for which CodeParser must not open an XML element.
ignore_classes = [
    'noType',
    'date',
    'data',
    'fct_links_top',
    'fct_links_bottom',
    'showMenu',
]

# Attribute values that make CodeParser close the XML elements still open at
# the same or a deeper level before going on.
classes = [
    'TM1Code',
    'TM2Code',
    'TM3Code',
    'TM4Code',
    'TM5Code',
    'TM6Code',
    'TM7Code',
    'TM8Code',
    'codeLienArt',
]

# Source class name -> name of the XML element written to the output.
# The 'date' and 'data' classes have no mapping; they are listed in
# ignore_classes instead.
toc = {
    'TM1Code': 'niveau1',
    'TM2Code': 'niveau2',
    'TM3Code': 'niveau3',
    'TM4Code': 'niveau4',
    'TM5Code': 'niveau5',
    'TM6Code': 'niveau6',
    'TM7Code': 'niveau7',
    'TM8Code': 'niveau8',
    'codeLienArt': 'Articles',
    'a': 'a',
}

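# Classes recognised in the source page: TMxCode (x = 1..8), mapped to the
# niveauX elements, and codeLienArt, which marks a link to a page of articles.
# Shape of the generated XML:
# <niveau1 title="blahblah">
#   <niveau2 title="article L627">
#     [text of the article...]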

def protect_percent(text):
    """Return *text* with every '%' doubled.

    The doubled form survives a later %-formatting pass as a single
    literal percent sign.
    """
    escaped = text.replace('%', '%%')
    return escaped

class ArticlesParser(HTMLParser):
    """Turn the HTML of an article page into a run of <article> elements.

    The parser reacts to <div> elements according to their attributes:

    * class="titreArt" starts a new article; the text of the div becomes
      the ``title`` attribute of the emitted <article> tag.  Text found
      inside an <a> nested in the title is dropped.
    * 'corpsArt' is the article body; its text is appended as-is.
    * 'histoArt' and 'notaArt' are emitted as <Histo>...</Histo> and
      <Note>...</Note>.

    The generated markup accumulates in ``self.fullcontent``.  The last
    article is closed when </body> is seen or, failing that, when close()
    is called, so callers must call close() before reading fullcontent.

    Entity and character references are not handled by this class (there
    is no handle_entityref / handle_charref override), and body text is
    written without XML escaping.
    """

    def __init__(self):
        HTMLParser.__init__(self)
        self.inArticle = False   # an <article> element is currently open
        self.inTitle = False     # inside the 'titreArt' div
        self.inContent = False   # inside the 'corpsArt' div
        self.inNote = False      # inside the 'notaArt' div
        self.inHisto = False     # inside the 'histoArt' div
        self.inPar = False
        self.content = ''        # text buffer for the title / the body
        self.histo_content = ''  # text buffer for the 'histoArt' div
        self.nota_content = ''   # text buffer for the 'notaArt' div
        self.fullcontent = ''    # generated XML
        self.ignore = False      # True while inside an <a> of the title

    def handle_startendtag(self, tag, attrs):
        # Self-closing tags are deliberately not forwarded to
        # handle_starttag / handle_endtag.
        pass

    def handle_starttag(self, tag, attrs):
        if tag in ignore_v2:
            return
        if tag == 'a':
            if self.inTitle:
                self.ignore = True
            return
        if tag != 'div':
            return

        for Nom, Valeur in attrs:
            if Nom == 'class' and Valeur == 'titreArt':
                # A new title closes the previous article, if any.
                if self.inArticle:
                    self.fullcontent += '</article>'
                self.inArticle = True
                self.inTitle = True
                self.content = ''
            # NOTE(review): the three tests below look at the value of any
            # attribute, not only "class"; kept as in the original.
            if Valeur == 'histoArt':
                # end-of-article history notes
                self.inHisto = True
                self.histo_content = ''
            elif Valeur == 'corpsArt':
                # article body
                self.inContent = True
                self.content = ''
            elif Valeur == 'notaArt':
                # end-of-article notes
                self.inNote = True
                self.nota_content = ''

    def handle_endtag(self, tag):
        if tag in ignore_v2:
            return

        if tag == 'div':
            # There is no depth tracking: any </div> ends the first active
            # section in the order title, body, history, note.
            if self.inTitle:
                self.fullcontent += '<article title=' + quoteattr(self.content.strip()) + '>'
                self.content = ''
                self.inTitle = False
            elif self.inContent:
                # end of the body
                self.fullcontent += self.content
                self.content = ''
                self.inContent = False
            elif self.inHisto:
                self.fullcontent += '<Histo>' + self.histo_content.strip() + '</Histo>'
                self.histo_content = ''
                self.inHisto = False
            elif self.inNote:
                self.fullcontent += '<Note>' + self.nota_content.strip() + '</Note>'
                self.nota_content = ''
                self.inNote = False
        elif tag == 'a' and self.ignore:
            self.ignore = False
        elif tag == 'body' and self.inArticle:
            self._close_article()

    def handle_data(self, data):
        if not self.ignore and (self.inTitle or self.inContent):
            self.content += data
        elif self.inHisto:
            self.histo_content += data
        elif self.inNote:
            self.nota_content += data

    def close(self):
        """Finish parsing and close the last article if it is still open.

        Without this, a page that never reaches </body> (fragment or
        truncated download) would leave an unterminated <article>.
        """
        HTMLParser.close(self)
        if self.inArticle:
            self._close_article()

    def _close_article(self):
        # Emit the closing tag once and remember that nothing is open.
        self.content = ''
        self.fullcontent += '</article>'
        self.inArticle = False

# Results of the article downloads: index given to FetcherThread
# ('Article0', 'Article1', ...) -> XML fragment produced by ArticlesParser.
fetched_url = {}
# Held by a FetcherThread while it stores its result in fetched_url.
fetched_url_lock = threading.Lock()
# Held by a FetcherThread around its urllib.urlopen() call, so only one
# thread at a time is inside urlopen().
urllib_lock = threading.Lock()
# Upper bound on the number of FetcherThread instances alive at once.
MAX_THREADS = 20
# One permit is taken in ArticleFetcher.download_article before a thread is
# started and given back at the end of FetcherThread.run. The main script
# takes all MAX_THREADS permits to wait for every download to finish.
max_threads = threading.Semaphore(MAX_THREADS)

class FetcherThread(threading.Thread):
    """Download one article page, parse it and publish the result.

    url   -- address of the page to download.
    index -- key under which the generated XML is stored in fetched_url.

    The creator of the thread is expected to have acquired one permit of
    max_threads; run() always gives it back, even when the download or the
    parsing fails, so the script waiting on the semaphore cannot hang.
    """

    def __init__(self, url, index):
        threading.Thread.__init__(self)
        self.url = url
        self.index = index

    def run(self):
        try:
            # Only one thread at a time is allowed inside urlopen().
            urllib_lock.acquire()
            try:
                self.articles = urllib.urlopen(self.url)
            finally:
                urllib_lock.release()

            try:
                parsarticle = ArticlesParser()
                parsarticle.feed(self.articles.read())
                parsarticle.close()
            finally:
                # Do not leak the connection, whatever happened above.
                self.articles.close()

            fetched_url_lock.acquire()
            try:
                fetched_url[self.index] = parsarticle.fullcontent
            finally:
                fetched_url_lock.release()
        finally:
            # On failure the exception still propagates (and no entry is
            # stored in fetched_url), but the permit is returned.
            max_threads.release()

class ArticleFetcher:
    """Start background downloads of article pages."""

    def download_article(self, url, index):
        """Fetch *url* in a new FetcherThread, storing the result under *index*.

        Blocks on the max_threads semaphore until a permit is available;
        the started thread is the one that gives the permit back.
        """
        max_threads.acquire()
        FetcherThread(url, index).start()

class CodeParser (HTMLParser):
    """Convert the table-of-contents page of a code into an XML template.

    The generated text accumulates in ``self.fullcontent``.  Every link to
    a page of articles is handed to ``self.fetcher`` for download and
    replaced by a ``%(ArticleN)s`` placeholder, so fullcontent is meant to
    be completed by %-formatting it with a mapping keyed by 'ArticleN'.
    Titles are therefore passed through protect_percent().

    ``self.Close`` is the stack of element names that are still open; it
    starts with "Code" and whatever is left on it after parsing has to be
    closed by the caller.
    """

    # Relative links to article pages start with this script name.
    _ARTICLE_LINK = re.compile(r'^affichCode.do')

    def __init__(self):
        HTMLParser.__init__(self)
        self.Flag = False    # the next text node completes a pending title="..."
        self.FlagA = False   # the next text node is the label of an article link
        self.Close = ["Code"]
        self.fullcontent = ''
        self.fetcher = ArticleFetcher()
        self.article_idx = 0

    def handle_startendtag(self, tag, attrs):
        # Self-closing tags are deliberately not forwarded to
        # handle_starttag / handle_endtag.
        pass

    def handle_starttag(self, tag, attrs):
        # The original test was `tag in ['p', ignore]`, which compared the
        # tag with the list object itself and so never skipped the tags
        # listed in `ignore`.
        if tag == 'p' or tag in ignore:
            return

        for Nom, Valeur in attrs:
            if Valeur in classes and toc[Valeur] in self.Close:
                # self.Close is the stack of open elements: unwind it down
                # to, and including, the last element of the same kind.
                while toc[Valeur] != self.Close[-1]:
                    self.fullcontent += "\t</" + self.Close.pop() + ">\n"
                self.fullcontent += "\t</" + self.Close.pop() + ">\n"

            if Nom == 'id' and Valeur == 'titreTexte':
                # title of the code
                self.fullcontent += "<Code title="
                self.Flag = True
            elif Nom == 'class' and Valeur == "codeLienArt":
                # marks a link to a page of articles; nothing to emit
                pass
            elif Nom == 'class' and Valeur not in ignore_classes:
                # open an element; a class missing from toc raises KeyError
                self.fullcontent += "\t<" + toc[Valeur] + " title="
                self.Close.append(toc[Valeur])
                self.Flag = True
            elif Nom == 'href':
                # a link to articles: start downloading them
                if self._ARTICLE_LINK.search(Valeur):
                    self.FlagA = True
                    art_url = "http://www.legifrance.gouv.fr/" + Valeur

                    self.fetcher.download_article(art_url, 'Article%s' % self.article_idx)
                    # '%%' yields a literal '%': this writes %(ArticleN)s
                    self.fullcontent += '%%(Article%s)s' % self.article_idx

                    self.article_idx += 1

                    self.Flag = True

    def handle_endtag(self, tag):
        if tag in ignore:
            return
        # NOTE(review): the original author's comment says the purpose of
        # this branch is no longer known.  Flag is still set at </a> only
        # when no text node followed the start tag that set it.
        if self.Flag and tag == 'a':
            self.fullcontent += "\t</" + tag + ">\n"

    def handle_data(self, title):
        if not self.Flag:
            return
        if self.FlagA:
            # label of an article link: not written to the output
            self.FlagA = False
        else:
            # complete the pending title="..." and close the start tag
            self.fullcontent += quoteattr(protect_percent(title.strip())) + ">\n"
        self.Flag = False
       
# Download the table-of-contents page; the connection is closed as soon as
# the page has been read.
content = urllib.urlopen(url)
try:
    page = content.read()
finally:
    content.close()

# Parsing the page starts one download thread per link to articles.
parser = CodeParser()
parser.feed(page)
parser.close()

# Wait for all threads: every running FetcherThread holds one permit, so
# owning all MAX_THREADS permits means that none is still running.
for i in range(MAX_THREADS):
    max_threads.acquire()

# Build the whole document before touching the output file, so that a
# failed substitution (KeyError for an article that could not be fetched)
# does not leave a truncated file behind.
document = parser.fullcontent % fetched_url

# Close every element that is still open.
while parser.Close:
    document += "\n</" + parser.Close.pop() + ">"

file = open(filecode, "w")
try:
    file.write('<?xml version="1.0" encoding="UTF-8"?>\n')
    file.write(document)
finally:
    file.close()
