# -*- coding: utf-8 -*-
import re, sys
from CodeMapping import CodeMapping
from utftourl import utftourl
from string import lower

from database import DBArticleTable, DBCodeTable

#
# re to detect codes
#

#code_names = CodeMapping.keys()
#code_names.sort(lambda a,b: b < a)

def protect_group_name(name):
    """Turn a code name into a string usable as a regexp group name.

    The name is passed through utftourl() and every '-' of the result
    is replaced by '_'.
    """
    return utftourl(name).replace('-', '_')

# One named group per known code: the group name is the protected form of
# the code name, the group body is the code name itself (used as regexp
# text).  code_names_group_to_url maps each group name back to the value
# stored in CodeMapping for that code.
_code_names_groups = []
code_names_group_to_url = {}
for k, v in CodeMapping.items():
    _code_names_groups.append('(?P<%s>%s)' % (protect_group_name(k), k))
    code_names_group_to_url[protect_group_name(k)] = v

# A single alternation over all code names.
code_names_re_text = '|'.join(_code_names_groups)
code_names_re = re.compile(code_names_re_text,
                           re.IGNORECASE | re.UNICODE | re.DOTALL)

#
# re to detect articles
#
# Verbose-mode pattern for ONE article reference.  Named groups:
#   prefix : optional; one of l, r, d, a (either case) or "l o" / "L O"
#            with any dots/whitespace between the two letters, optionally
#            followed by '*'
#   number : mandatory; one or more runs of digits or '^' (the caret is a
#            literal inside the character class), each run optionally
#            preceded by one whitespace and/or a '-'
#   suffix : optional; er, bis, ter, quater, or an "...ies" form
#            (quinquies ... nonies, [un|duo|...]decies/vicies/tricies)
#   extra  : up to two upper-case letters or digits (may be empty)
# The same text is embedded in article_trigger_re below, so any change here
# changes both patterns.
article_re_text = r'''
(
 (
  (?P<prefix>
   ([lrdaLRDA]|l(\.|\s)*o|L(\.|\s)*O)\*?)
 )?
 \s*(\.|\*)*\s*
 (?P<number>(\s?-?[\d^]+)+)
 \s*
 (?P<suffix>
  er|bis|ter|quater|
  (
   (
    (
     quinqu|sex|sept|oct|non)ies
   )|
   (
    (
     un|duo|ter|quater|quin|sex|sept|octo|novo
    )?
    (decies|vicies|tricies)
   )
  )
 )?
 [-\s]*
 (?P<extra>
  ([A-Z0-9]{0,2})?
 )
)
'''

# Compiled WITHOUT re.IGNORECASE: suffixes are matched in lower case only
# and 'extra' in upper case only.
article_re = re.compile(article_re_text, 
                        re.MULTILINE | re.DOTALL | re.VERBOSE)

# Coarse trigger: the word "article" or "articles" (case-insensitive),
# some whitespace, then one or more article references (article_re_text),
# each followed by any run of separators: "ou", "et", "à", a whitespace or
# non-word character, or a parenthesised span.
# The pattern contains no anchors or look-behind assertions.
# NOTE(review): the pattern is a plain (byte) string literal compiled
# without re.UNICODE -- confirm that the "à" alternative behaves as intended
# on unicode input.
article_trigger_re = re.compile(r'''
(?:
 (
  article
  (?:
    s
  )?
  \s+
  (''' + \
   article_re_text \
 + r'''
   (?:
    ou|et|à|[\s\W]|\(.*?\))*
  )+
 )
)
''', re.IGNORECASE | re.MULTILINE | re.DOTALL | re.VERBOSE)

#
# re to detect end-of-sentence
#
# An end of sentence is a literal '.' or a ';'.
# The pattern has no letters, anchors, unescaped dots or whitespace, so the
# flags passed below do not change what it matches.
eos_re_text=r'\.|;'
eos_re = re.compile(eos_re_text,
                    re.IGNORECASE | re.MULTILINE | re.DOTALL | re.VERBOSE)


#
# re to check false-positives
#
# Words whose presence vetoes an article reference: "la loi", "ordonnance",
# "décret", "la circulaire", "la directive" (case-insensitive).
# hinted_parse_content() searches this pattern between the start of an
# article match and the next end-of-sentence, and skips the article when it
# is found.
# NOTE(review): the literal is not a raw string; "\s" reaches the regexp
# engine only because Python keeps unknown escape sequences as-is.
veto_re_text=u"(?:la\s+loi)|(?:ordonnance)|(?:décret)|(?:la\s+circulaire)|(?:la\s+directive)"
veto_re = re.compile(veto_re_text,
                     re.IGNORECASE | re.MULTILINE | re.DOTALL | re.VERBOSE | re.UNICODE)

def normalize_article_reference(title):
    """Utility function: return the normalized form of an article title.

    The first article reference found in title by article_re is split
    into its prefix, number, suffix and extra parts; the non-empty parts
    are joined with '-', lower-cased and stripped of surrounding
    whitespace.
    e.g. : L612-3 bis -> l-612-3-bis

    Return None when title contains no article reference.
    """
    found = article_re.search(title)
    if not found:
        return None

    # 'number' is mandatory in the pattern; the other parts are kept only
    # when they matched something.
    parts = []
    if found.group('prefix'):
        parts.append(found.group('prefix'))
    parts.append(found.group('number'))
    for optional in ('suffix', 'extra'):
        if found.group(optional):
            parts.append(found.group(optional))

    return '-'.join(parts).lower().strip()

def parse_article(content, start, end):
    """
    Cut a list of articles into individual matches.

    content: the full text.
    start:   index in content where the list of articles begins.
    end:     just a hint to limit the first regexp searches; it is pushed
             forward whenever a match runs past it.

    Return a list of MatchArticle objects whose start/end are indexes in
    content.  Scanning stops at the first end-of-sentence found between
    two consecutive matches.
    """
    found_articles = []
    cursor = start
    # search some chars after 'end'
    found = article_re.search(content[cursor:end + 8])
    while found is not None:
        raw_end = cursor + found.end(0)
        # extend the searched text when the match overruns the hint
        end = max(end, raw_end)

        # strip whitespaces on both sides of the match
        last = raw_end
        first = cursor + found.start(0)
        while content[last - 1].isspace():
            last -= 1
        while content[first].isspace():
            first += 1

        # register non-empty matches only
        if first < last:
            found_articles.append(MatchArticle(first, last))

        # iterate: resume right after the stripped match,
        # again searching some chars after 'end'
        cursor = last
        found = article_re.search(content[cursor:end + 8])

        # an end-of-sentence between this match and the next one closes
        # the list
        if found is not None and \
           eos_re.search(content[cursor:cursor + found.start(0)]):
            break

    return found_articles

def lookup_article_references(content):
    """Return the list of article matches found in content.

    article_trigger_re roughly locates each "article(s) ..." span; every
    span is then handed to parse_article(), which cuts it into individual
    MatchArticle objects.  The result is the concatenation of those lists,
    in order of appearance.
    """
    articles = []

    # Search directly in content from a start offset rather than in a
    # fresh slice: slicing copied the remainder of the text for every
    # match.  Both forms find the same matches because the trigger pattern
    # has no anchors or look-behind; match offsets are now absolute.
    m = article_trigger_re.search(content)
    while m:

        # strip whitespaces
        _end = m.end(0)
        _start = m.start(0)
        while content[_end-1].isspace():
            _end -= 1
        while content[_start].isspace():
            _start += 1

        # look for article reference, more precisely
        articles.extend(parse_article(content, _start, _end))

        # iterate: resume right after the stripped match
        m = article_trigger_re.search(content, _end)

    return articles

def lookup_code_references(content):
    """Return the list of code-name matches found in content.

    Each match of code_names_re becomes a MatchCode whose start/end are
    indexes in content (surrounding whitespace stripped) and whose url is
    the CodeMapping value of the code name that matched.
    """
    found_codes = []
    offset = 0           # index in content of the text currently searched
    found = code_names_re.search(content)
    while found is not None:

        # strip whitespaces
        last = offset + found.end(0)
        first = offset + found.start(0)
        while content[last - 1].isspace():
            last -= 1
        while content[first].isspace():
            first += 1

        # lookup which code matched: first named group with a value
        url = None
        for group_name, group_value in found.groupdict().items():
            if not group_value:
                continue
            url = code_names_group_to_url[group_name]
            break

        # register match
        if content[first:last] and url:
            found_codes.append(MatchCode(first, last, url))

        # iterate on what follows the stripped match
        offset = last
        found = code_names_re.search(content[offset:])

    return found_codes

def lookup_end_of_sentences(content, articles, codes):
    """Tag end-of-sentence after each articles and code,
    for easier traversal of fsm.

    content:  the full text.
    articles: list of article matches (left untouched).
    codes:    list of code matches (left untouched).

    Return a new list holding every article and code match ordered by end
    index.  A MatchEndOfSentence is inserted after a match when an
    end-of-sentence is found between that match and the next one (or the
    end of content for the last match).  A non-empty result always ends
    with a MatchEndOfSentence.
    """

    # aggregate both sets in a fresh list: the caller's lists are not
    # modified
    matches = list(articles)
    matches.extend(codes)
    # key-based sort: same order as comparing a.end - b.end, and stable
    matches.sort(key=lambda match: match.end)

    fullset = []
    for position, match in enumerate(matches):
        fullset.append(match)

        # end-of-sentence markers supplied by the caller are kept as-is
        if match.type == Match.END_OF_SENTENCE:
            continue

        # look for an end-of-sentence up to the next match; the last match
        # is searched up to the end of content, so that its sentence is
        # bounded like every other one
        if position + 1 < len(matches):
            limit = matches[position + 1].start
        else:
            limit = len(content)
        m = eos_re.search(content[match.end:limit])
        if m:
            fullset.append(MatchEndOfSentence(match.end + m.start(0)))

    # append eos at the end of content, if needed
    if fullset and fullset[-1].type != Match.END_OF_SENTENCE:
        fullset.append(MatchEndOfSentence(len(content)-1))
    return fullset

def hinted_parse_content(content, fsm, default_code):
    """
    Return a list of article matches in content,
    using fsm as crude Finite State Automata to help analysis.

    content:      the full text.
    fsm:          ordered list of matches (articles, codes and
                  end-of-sentences); the helpers below walk forward until
                  they meet an end-of-sentence, so the list is expected to
                  end with one.
    default_code: url of the code to look into first of all.

    Return a list of MatchLink, one per article reference for which an
    article was found in the database.
    """
    def _text_of(match):
        return content[match.start:match.end]

    def _code_index_in_sentence(position):
        """Return the index in fsm of the next reference to a code, or
        None when an end-of-sentence comes first."""
        while fsm[position].type not in (Match.CODE, Match.END_OF_SENTENCE):
            position += 1
        if fsm[position].type == Match.CODE:
            # no end-of-sentence encountered before this code reference
            return position
        return None

    def _eos_offset(position):
        """Return the offset in content of the next end-of-sentence."""
        while fsm[position].type != Match.END_OF_SENTENCE:
            position += 1
        return fsm[position].start

    def _remember_code(code_match):
        """Add the code named by code_match to the suspects, once."""
        # NOTE(review): the url stored on MatchCode is not used here; the
        # matched text is converted with utftourl() instead -- confirm
        # that this is intended.
        name = utftourl(_text_of(code_match))
        if name not in suspects:
            suspects.append(name)

    def _find_article(title):
        """Return the first article named title in the suspected codes,
        most recently added code first; None when there is none."""
        # NOTE(review): a code that is already a suspect is not moved to
        # the end of the list when it is met again, so it does not regain
        # priority -- confirm that this is intended.
        for code_url in reversed(suspects):
            code_rows = list(DBCodeTable.selectBy(url=code_url))
            if not code_rows:
                continue
            article_rows = list(DBArticleTable.selectBy(normalizedTitle=title,
                                                        code=code_rows[0].id))
            if article_rows:
                # found a candidate in code_url !
                return article_rows[0]
        return None

    links = []
    suspects = [ default_code ]

    # iterate over each match in content
    for index, match in enumerate(fsm):

        if match.type == Match.CODE:
            _remember_code(match)
            continue

        if match.type != Match.ARTICLE:
            continue

        # a code named later in the same sentence becomes a suspect too
        code_index = _code_index_in_sentence(index + 1)
        if code_index:
            _remember_code(fsm[code_index])

        # skip the article when its sentence holds a vetoed word
        sentence_end = _eos_offset(index + 1)
        if veto_re.search(content[match.start:sentence_end]):
            continue

        article = _find_article(normalize_article_reference(_text_of(match)))
        if article:
            links.append( MatchLink(match.start,
                                    match.end,
                                    _text_of(match).replace('\n', ' '),
                                    article.code.title,
                                    article.url) )

    return links

class Match:
    """Base class of every match object.

    start: index of start-of-match
    end:   index of end-of-match
    type:  CODE, ARTICLE, LINK or END_OF_SENTENCE, whatever applies
           (set by each subclass)
    """
    CODE, ARTICLE, LINK, END_OF_SENTENCE = range(4)

    def __init__(self, start, end):
        self.start, self.end = start, end

class MatchCode(Match):
    """Match of a reference to a code; url is the url given for that code."""
    type = Match.CODE

    def __init__(self, start, end, url=''):
        self.url = url
        Match.__init__(self, start, end)

class MatchArticle(Match):
    """Match of a reference to an article.

    text and code_hint are stored on a single line: newlines are replaced
    by spaces and surrounding whitespace is stripped.
    """
    type = Match.ARTICLE

    def __init__(self, start, end, text='', code_hint=''):
        Match.__init__(self, start, end)
        one_line = lambda value: value.replace('\n', ' ').strip()
        self.text = one_line(text)
        self.code_hint = one_line(code_hint)

class MatchLink(Match):
    """Match to be replaced by a link: text of the link, title of the code
    and url of the target."""
    type = Match.LINK

    def __init__(self, start, end, text='', code='', url=''):
        Match.__init__(self, start, end)
        self.text, self.code, self.url = text, code, url

class MatchEndOfSentence(Match):
    """Zero-length match marking an end-of-sentence: end equals start."""
    type = Match.END_OF_SENTENCE

    def __init__(self, start):
        Match.__init__(self, start, start)

def print_tagged_text(content, fsm):
    """Debug helper: print content with every match of fsm tagged.

    Each match is wrapped as <type>matched text</type>, where type is the
    numeric type of the match; the text between matches is copied as-is.

    content: the full text.
    fsm:     ordered list of matches (objects with type, start and end).

    Return the tagged text.  When fsm is empty, content is returned
    unchanged and nothing is printed.
    """
    if not fsm:
        return content

    pieces = []
    cursor = 0
    # every match is handled the same way, so that no match and no text
    # lying between two matches is left out
    for a in fsm:
        pieces.append(content[cursor:a.start])
        pieces.append('<%s>%s</%s>' % (a.type,
                                       content[a.start:a.end],
                                       a.type))
        cursor = a.end
    # text following the last match
    pieces.append(content[cursor:])

    result = ''.join(pieces)
    print(result)
    return result
    

def add_href(content, default_code=''):
    """Return content with each resolved article reference turned into a
    link.

    content:      the text to process.
    default_code: name of the code to look into first when resolving an
                  article; it is converted with utftourl() before use.

    Every reference for which hinted_parse_content() found an article is
    replaced by an <a> element surrounded by one space on each side; the
    rest of content is copied unchanged.
    """

    #
    # part 1: lookup references to articles and codes in content
    # build lists of Match objects that index match positions in content
    #
    articles = lookup_article_references(content)
    codes = lookup_code_references(content)

    #
    # part 2: aggregates informations
    #  merge both Match object lists, ordered by end-index,
    #  insert END_OF_SENTENCE matches between them.
    #  fsm is an ordered list of matches (articles, codes, end-of-sentences)
    #
    fsm = lookup_end_of_sentences(content, articles, codes)

    #
    # part 3: build result, using fsm as Finite State Automata.
    #
    default_code_url = utftourl(default_code)
    links = hinted_parse_content(content, fsm, default_code_url)

    #
    # part 4: insert href in place of Match objects in links
    #
    # NOTE(review): url, text and code title are inserted without HTML
    # escaping -- confirm that they can never hold '"', '<' or '&'.
    pieces = []
    start_index = 0
    for link in links:
        # copy up to next match
        pieces.append(content[start_index:link.start])
        pieces.append(r' <a href="%s" title="article %s du %s">%s</a> ' % (link.url,
                                                                          link.text,
                                                                          link.code,
                                                                          link.text))
        start_index = link.end
    # copy what follows the last match
    pieces.append(content[start_index:])

    return ''.join(pieces)


if __name__ == '__main__':
    # usage: <script> FILE [DEFAULT_CODE]
    # FILE is read as utf-8; DEFAULT_CODE is optional and defaults to ''.
    f = open(sys.argv[1], 'r')
    try:
        # try/finally: the file is closed even when reading or decoding
        # fails
        content = f.read().decode('utf-8')
    finally:
        f.close()

    if len(sys.argv) > 2:
        default_code = sys.argv[2].decode('utf-8')
    else:
        default_code = ''

    result = add_href(content, default_code)

    print(result)

