Utilisateur:Jona/process occ.py

#! /usr/bin/env python
# -*- coding: utf-8 -*-
 
#Create two pickle files (dictOcc and listOcc) that will be used by the script generate_table
 
import sys, re, getopt, os, shutil, pickle
 
#TODO: add an interactive mode
#TODO: make a more formal usage output and options manager
 
# Wiki templates recognised by extract_types(); compiled once instead of on
# every input line.
_LANG_RE = re.compile(r'\{\{langue\|(.*?)\|?\}\}')    # e.g. {{langue|fr}}
_TYPE_RE = re.compile(r'\{\{-(\S+)-(\|\S+){0,2}\}\}')  # e.g. {{-nom-|fr}}
_STUB_RE = re.compile(r'\{\{ébauche(\|\S{0,3})?\}\}')  # Can be improved

# Template names stored as matched (the whole regex match tuple is kept).
_DIRECT_TYPES = frozenset(["nom", "adj", "nom-pr", "pron", "verb", "adv"])
# Long template names folded into their short equivalent.
_TYPE_ALIASES = {
    "adjectif": "adj",
    "pronom": "pron",
    "verbe": "verb",
    "adverbe": "adv",
}


def extract_types(articles):
    """Extract the languages and the word types found in a dump file.

    The file is read line by line. Every ``{{langue|xx}}`` template adds
    ``xx`` (the text before the first ``|``) to the list of languages and
    becomes the current language. Every ``{{-type-}}`` template whose type
    is recognised, and every ``{{ébauche}}`` template (recorded as
    ``"stub"``), adds a ``[type_tuple, current_language]`` entry to the list
    of types. ``type_tuple[0]`` is always the type name.

    articles -- name of the file to read.

    Return a two-item list ``[ltype, llang]``.

    Unrecognised types are only collected and printed when the module-level
    ``_debug`` flag is set; the flag is treated as off when ``main()`` has not
    defined it yet, so this function can be called on its own.
    """
    debug = globals().get("_debug", 0)

    llang = []
    ltype = []
    otherTypes = []
    currentLang = ''

    # The context manager closes the file even if parsing raises.
    with open(articles, 'r') as fin:
        for line in fin:
            for lang in _LANG_RE.findall(line):
                # The lazy group may still hold extra template parameters;
                # only the part before the first "|" is the language code.
                lang = lang.split("|", 1)[0]
                llang.append(lang)
                currentLang = lang

            for found in _TYPE_RE.findall(line):
                name = found[0]
                if name in _DIRECT_TYPES:
                    ltype.append([found, currentLang])
                elif name in _TYPE_ALIASES:
                    ltype.append([(_TYPE_ALIASES[name],), currentLang])
                elif "flex" in name:
                    ltype.append([("flex",), currentLang])
                elif name.startswith("loc"):
                    ltype.append([("loc",), currentLang])
                elif debug and name not in otherTypes:
                    otherTypes.append(name)

            # At most one stub entry is recorded per line.
            if _STUB_RE.search(line):
                ltype.append([("stub",), currentLang])

    if debug:
        print("Types not computed :")
        print(otherTypes)
        print("\n")

    return [ltype, llang]
 
def compute_occ(extendedl):
    """Count how many times each item appears in ``extendedl``.

    Return a dict mapping every distinct item to its number of occurrences.
    """
    counts = {}
    for item in extendedl:
        if item in counts:
            counts[item] += 1
        else:
            counts[item] = 1
    return counts
 
def compute_occ_subdict(extendedl,subdictTemplate={}):
    """Count items per group, one sub-dictionary per group.

    Each element of ``extendedl`` is indexed as ``element[1]`` (the group
    key) and ``element[0][0]`` (the name that is counted inside that group).
    A group's sub-dictionary starts as a copy of ``subdictTemplate``, which
    is never modified.

    Return a dict of dicts: ``{group: {name: count}}``.
    """
    groups = {}
    for entry in extendedl:
        group = entry[1]
        if group not in groups:
            # Copy the template so that groups do not share one dictionary.
            groups[group] = dict(subdictTemplate)
        name = entry[0][0]
        counters = groups[group]
        counters[name] = counters.get(name, 0) + 1
    return groups
 
def file_to_list(nameFile):
    """Read a text file into a list of lines.

    nameFile -- name of the file to read.

    Return a list with one string per line, newline characters stripped.
    """
    # The context manager closes the file even if reading raises.
    with open(nameFile, 'r') as f:
        return [line.strip('\n') for line in f]
 
def list_to_file(l,nameFile):
    """Print a list to a file, one item per line.

    A tuple item is written as its first two elements separated by a tab;
    a string item is written as is. Any other item is skipped with a
    message on standard output.

    For list not used in this module, the result can be unexpected.

    l        -- the list to write (a warning is printed for other types).
    nameFile -- name of the file to (over)write.
    """
    if not isinstance(l, list):
        print("Warning: The argument is not a list (list_to_file())")
        print("Unexpected behavior can occur")
    # The context manager closes the file even if a write raises.
    with open(nameFile, 'w') as f:
        for s in l:
            if isinstance(s, tuple):
                f.write(str(s[0]) + "\t" + str(s[1]) + '\n')
            elif isinstance(s, str):
                f.write(s + '\n')
            else:
                print("This format is not supported")
 
def dict_to_file(d,nameFile):
    """Print a dict to a file, one "key value" pair per line.

    Keys and values are converted with str(), so the occurrence counts
    (integers) produced by this module can be written directly.

    For dict not used in this module, the result can be unexpected.

    d        -- the dict to write (a warning is printed for other types).
    nameFile -- name of the file to (over)write.
    """
    if not isinstance(d, dict):
        print("Warning: The argument is not a dict (dict_to_file())")
        print("Unexpected behavior can occur")
    # The context manager closes the file even if a write raises.
    with open(nameFile, 'w') as f:
        for s in d:
            f.write(str(s) + ' ' + str(d[s]) + '\n')
 
def make_diff(l1,l2):
    """Return the items of ``l2`` that are absent from ``l1`` (l2 - l1).

    The result is a list without duplicates, in no particular order.
    """
    excluded, candidates = set(l1), set(l2)
    return list(candidates - excluded)
 
def retrieve_occ(l,occ):
    """Pair every item of ``l`` with its count looked up in the dict ``occ``.

    Items missing from ``occ`` get a count of 0.

    Return a list of tuples (count, item), in the order of ``l``.
    """
    return [(occ.get(item, 0), item) for item in l]
 
 
def usage():
    """Write the list of command-line options to standard error."""
    # The help text matches what main() does: -d switches debug output on,
    # and the results always go to the "listOcc" and "dictOcc" pickle files.
    sys.stderr.write(
        'Options available are\n'
        '\n'
        '-h --help       Show this help\n'
        '-v --verbose    Enter verbose mode\n'
        '-i --input      Specify an input directory\n'
        '-o --output     Specify an output filename ("langsTableCol" by default)'
        ' [OBSOLETE: output is now written to "listOcc" and "dictOcc"]\n'
        '-d              Enter debug mode\n'
    )
 
 
def _write_pickle(obj, nameFile):
    """Pickle ``obj`` into the file ``nameFile``, closing it in every case."""
    # Binary mode is what the pickle documentation asks for; it gives the
    # same bytes on every platform.
    with open(nameFile, 'wb') as fpickle:
        pickle.Pickler(fpickle).dump(obj)


def main(argv):
    """Parse the command line, count languages and word types, pickle them.

    argv -- the command-line arguments without the program name.

    The input is the file "out" inside the input directory ("." by default).
    Two pickle files are written in the current directory: "dictOcc" (word
    type counts per language) and "listOcc" (number of occurrences of each
    language). If one of them already exists the user is asked whether to
    back it up to "*.old", overwrite it, or abort.

    Exits with status 2 on an illegal argument or when the user aborts, and
    with status 0 after printing the help.
    """
    global _verbose
    global _debug
    _verbose = 0
    _debug = 0

    inf = '/out'
    inputdir = "."
    outputs = ("dictOcc", "listOcc")

    try:
        opts, args = getopt.getopt(argv, "hvi:o:d", ["help", "verbose", "input=", "output="])
    except getopt.GetoptError:
        sys.stderr.write("Illegal argument\n")
        usage()
        sys.exit(2)

    for opt, arg in opts:
        if opt in ("-h", "--help"):
            usage()
            sys.exit(0)
        elif opt == '-d':
            _debug = 1
            #TODO: put debug condition
        elif opt in ("-v", "--verbose"):
            _verbose = 1
            #TODO: put verbose condition
        elif opt in ("-o", "--output"):
            # Obsolete: still accepted so that old command lines keep
            # working, but the output names are fixed.
            pass
        elif opt in ("-i", "--input"):
            #TODO: verify that arg is a directory path
            inputdir = arg

    # Look for old output files that have to be backed up.
    existing = [name for name in outputs if os.path.isfile(name)]
    if existing:
        try:
            ask = raw_input
        except NameError:
            # raw_input() was renamed to input() in Python 3.
            ask = input
        doicontinue = ask("Ouput file(s) already exist, overwrite ? [C/y/n] (by default copy the old file to *.old; y : to overwrite; n : to abort): ").lower()
        if doicontinue == "n":
            print("operation aborted by user")
            sys.exit(2)
        elif doicontinue != "y":
            for name in existing:
                shutil.copyfile(name, name + ".old")
            print("Old files copied to *.old")

    if _verbose:
        print('Extracting languages from "%s%s"...' % (inputdir, inf))
    print("It can now take several minutes...")
    llinks = extract_types(inputdir + inf)
    if _verbose:
        print("Languages extracted...")

    langsOcc = compute_occ(llinks[1])
    if _verbose:
        print("Languages computed...")

    # Create fields to avoid error when a lang missed one
    typeOcc = compute_occ_subdict(llinks[0], {'nom':0,'nom-pr':0,'adj':0,'verb':0,'adv':0,'flex':0,'loc':0, 'stub':0})
    if _verbose:
        print("Type of words computed...")
    if _debug:
        # .get(): a language absent from the input must not abort the run.
        for lang in ('fr', 'nl', 'ru'):
            print('%s :  %s' % (lang, typeOcc.get(lang)))

    _write_pickle(typeOcc, 'dictOcc')
    _write_pickle(langsOcc, 'listOcc')

    if _verbose:
        print("Pickle files written...")
 
if __name__ == '__main__':
    main(sys.argv[1:])
 
    #To estimate the time
    ##import timeit
    ##t = timeit.Timer("main(sys.argv[1:])", "from __main__ import main")
    ##print t.repeat(3,5)