#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# generates keyword_list.py from awstats data
#
import string, urllib, re, popen2, sys

# Pattern of words that disqualify a keyword (months, articles, common
# verbs, legal-document terms, ...).  The alternatives are not anchored, so
# search() finds them anywhere inside a string, and the '\?' in the two
# 'loi' entries stands for a literal question mark.
bad_word_re = re.compile(u'|'.join([
    u'code',
    u'août',
    u'article',
    u'autre',
    u'autres',
    u'avant',
    u'avec',
    u'avril',
    u'ceux',
    u'chapitre',
    u'click',
    u'contre',
    u'créé',
    u'dans',
    u'delà',
    u'dont',
    u'droit',
    u'décembre',
    u'décret',
    u'elle',
    u'entre',
    u'faire',
    u'fait',
    u'février',
    u'grande',
    u'janvier',
    u'juillet',
    u'juin',
    u'leur',
    u'leurs',
    u'livre',
    u'loi\\?',
    u'lois\\?',
    u'mai',
    u'mars',
    u'novembre',
    u'octobre',
    u'outre',
    u'paragraphe',
    u'partie',
    u'peut',
    u'pour',
    u'sans',
    u'section',
    u'selon',
    u'septembre',
    u'sont',
    u'sous',
    u'tenu',
    u'titre',
    u'tout',
    u'trop',
    u'veut',
    u'être',
]))

class ASpell(object):
    """Minimal client for a French ``aspell`` child process in pipe mode.

    Calling an instance with a word returns aspell's first suggestion when
    aspell reports the word as misspelled, and the word itself otherwise.
    """

    # Matches a "miss" reply of the pipe protocol,
    #     & <original> <count> <offset>: <first>, <second>, ...
    # and captures the first suggestion, up to the next comma or whitespace.
    # The original word is matched with \S+ so that words containing
    # non-word characters (e.g. a typographic apostrophe) are recognised.
    expect_re = re.compile(r'& \S+ \d+ \d+: ([^\s,]*)', re.UNICODE)

    def __init__(self):
        """Start the aspell child process and discard its banner line."""
        self._f = popen2.Popen3("aspell --encoding='utf-8' -l fr -d fr-80 -a")
        self._f.fromchild.readline() #skip the credit line

    def __call__(self, word):
        """Return the first aspell suggestion for *word*, or *word* itself.

        *word* is expected to be a unicode string; it is sent to aspell
        encoded as UTF-8.  The word is returned unchanged when aspell
        accepts it, offers no suggestion, sends an unrecognised reply, or
        has closed its output.
        """
        # The protocol is line based: a word spanning several lines would
        # trigger several replies and desynchronise every later call.
        if '\n' in word:
            return word

        # The leading '^' makes aspell spell-check the rest of the line even
        # when the word starts with one of its command characters
        # (*, &, @, +, -, ~, #, !, %).
        self._f.tochild.write((u'^' + word + u'\n').encode('utf-8'))
        self._f.tochild.flush()
        s = self._f.fromchild.readline().decode('utf-8')

        # An empty string means end of file on the child's output.
        if not s or s[0] not in u'+*&-#?':
            return word

        # Consume the rest of the reply, which ends with a blank line.
        while self._f.fromchild.readline().strip():
            pass

        # Only '&' replies carry suggestions.
        if s[0] != u'&':
            return word

        m = ASpell.expect_re.match(s)
        if m:
            return m.group(1)
        return word


# Characters that disqualify a keyword outright: '#', digits, '.', '|', ';',
# '`', "'", '-', '%', en dash (U+2013), bullet (U+2022), euro sign (U+20AC)
# and acute accent (U+00B4).
bad_char_re = re.compile(u"[#0123456789\\.|;`'\\-%%\u2013\u2022\u20AC\xb4]")

# Maps each retained keyword (unicode) to the sum of its counts.
keywords = {}

infile = open('keywords', 'r')
try:
    spell = ASpell()

    for line in infile:
        if 'END_KEYWORDS' in line or 'BEGIN_KEYWORDS' in line:
            continue

        # Each data line is "<url-quoted keyword> <count>".
        fields = line.split(None, 1)
        if len(fields) < 2:
            continue  # blank line, or a keyword without a count

        try:
            count = int(fields[1])
        except ValueError:
            sys.stderr.write('skipping line with invalid count: %r\n' % (line,))
            continue

        key = urllib.unquote(fields[0])

        # Keywords are normally UTF-8; fall back to Latin-1, which maps
        # every byte and therefore cannot fail.
        try:
            ekey = key.decode('utf-8')
        except UnicodeError:
            ekey = key.decode('iso-8859-1')

        # Lowercase after decoding so accented capitals are folded as well.
        ekey = ekey.lower()

        if len(ekey) < 4:
            continue

        # Drop a one-letter prefix followed by a typographic apostrophe
        # (U+2019) -- presumably an elided article such as "l".
        if ekey[1] == u'\u2019':
            ekey = ekey[2:]

        if bad_char_re.search(ekey):
            continue

        ekey = spell(ekey)

        # Filter again: the suggestion may contain unwanted characters,
        # be too short, or be one of the excluded words.
        if bad_char_re.search(ekey):
            continue

        if len(ekey) < 4:
            continue

        if bad_word_re.search(ekey):
            continue

        keywords[ekey] = keywords.get(ekey, 0) + count
finally:
    infile.close()

# Keep keywords counted at least 3 times, most frequent first.
klist = [(k, v) for k, v in keywords.items() if v >= 3]
klist.sort(lambda a, b: b[1] - a[1])

listfile = open('keyword_list.py', 'w')
try:
    listfile.write('#-*- coding: utf-8 -*-\nkeyword_list = [')
    for i, (k, v) in enumerate(klist):
        # Eight keywords per line.
        if not i % 8:
            listfile.write('\n\t')
        # Escape what would otherwise break the generated u'...' literal.
        literal = (k.replace(u'\\', u'\\\\')
                    .replace(u'\n', u'\\n')
                    .replace(u'\r', u'\\r'))
        listfile.write("u'%s'," % literal.encode('utf-8'))
    listfile.write(']\n')
finally:
    listfile.close()

#for (k, v) in klist:
    #line = '%s (%i)' % (k,v)
    #print line.encode('utf-8')
    
