#! /usr/bin/env python
# -*- coding: utf-8 -*-
# Create two pickle files (dictOcc and listOcc) that will be used by the generate_table script
import sys, re, getopt, os, shutil, pickle
#TODO: add an interactive mode
#TODO: make a more formal usage output and options manager
def extract_types(articles):
    """Collect language markers and word-type markers from a wikitext file.

    The file is read line by line.  Each ``{{langue|xx}}`` template records
    the code ``xx`` (anything after a further ``|`` is dropped) and makes it
    the current language for the markers that follow.  Each ``{{-type-}}``
    template with a recognised type is recorded together with the current
    language, and each line holding a stub template adds a ``stub`` marker.

    articles -- path of the file to scan.

    Return a two-item list ``[ltype, llang]``:
      * ``ltype`` is a list of ``[type_tuple, language]`` pairs whose
        ``type_tuple[0]`` is one of nom, adj, nom-pr, pron, verb, adv,
        flex, loc or stub.  For the short names the whole regex match
        tuple is stored; for the others a one-item tuple is stored.
      * ``llang`` is the list of language codes, one per template found,
        in order of appearance.

    When the module-level ``_debug`` flag is set, the unrecognised type
    names are printed once the whole file has been read.
    """
    # Short template names stored exactly as matched.
    kept = ("nom", "adj", "nom-pr", "pron", "verb", "adv")
    # Long French template names folded onto their short form.
    aliases = {"adjectif": "adj", "pronom": "pron",
               "verbe": "verb", "adverbe": "adv"}
    # _debug is only created by main(); default to "off" so that the
    # function can also be called on its own without a NameError.
    debug = globals().get('_debug', 0)

    # Raw strings hold exactly the same characters as the former plain
    # literals, without relying on unknown backslash escapes.
    lang_re = re.compile(r'\{\{langue\|(.*?)\|?\}\}')
    type_re = re.compile(r'\{\{-(\S+)-(\|\S+){0,2}\}\}')
    stub_re = re.compile(r'\{\{ébauche(\|\S{0,3})?\}\}')  # Can be improved

    llang = []
    ltype = []
    otherTypes = []
    currentLang = ''
    # The with-block closes the file even if reading fails half-way.
    with open(articles, 'r') as fin:
        for line in fin:
            for lang in lang_re.findall(line):
                # Keep only the code placed before any extra argument.
                lang = lang.split("|", 1)[0]
                llang.append(lang)
                currentLang = lang
            for found in type_re.findall(line):
                name = found[0]
                if name in kept:
                    ltype.append([found, currentLang])
                elif name in aliases:
                    ltype.append([(aliases[name],), currentLang])
                elif "flex" in name:
                    ltype.append([("flex",), currentLang])
                elif name.startswith("loc"):
                    ltype.append([("loc",), currentLang])
                elif debug and name not in otherTypes:
                    # Remember each unknown type once for the debug report.
                    otherTypes.append(name)
            if stub_re.search(line):
                ltype.append([("stub",), currentLang])
    if debug:
        # Single-argument print(...) behaves the same as the print statement.
        print("Types not computed :")
        print(otherTypes)
        print("\n")
    return [ltype, llang]
def compute_occ(extendedl):
    """Count how many times each item appears in extendedl.

    Return a dict mapping every distinct item to its number of occurrences.
    """
    tally = {}
    for item in extendedl:
        if item in tally:
            tally[item] += 1
        else:
            tally[item] = 1
    return tally
def compute_occ_subdict(extendedl,subdictTemplate={}):
    """Count items per group, giving one counter dict per group.

    Each element of extendedl is read as ``[key_tuple, group]``: the count
    stored under ``key_tuple[0]`` in the dict of ``group`` is increased.
    The dict of a group starts as a copy of subdictTemplate, so the
    template itself is never modified.

    Return a dict of dicts: ``{group: {key: occurrences}}``.
    """
    per_group = {}
    for entry in extendedl:
        group = entry[1]
        if group not in per_group:
            # First time this group is met: start from a fresh copy
            per_group[group] = dict(subdictTemplate)
        counters = per_group[group]
        key = entry[0][0]
        counters[key] = counters.get(key, 0) + 1
    return per_group
def file_to_list(nameFile):
    """Read a text file into a list of lines.

    nameFile -- path of the file to read.

    Return a list with one string per line, with the newline characters
    removed from both ends of each line.
    """
    # The with-block closes the file even if reading raises.
    with open(nameFile, 'r') as f:
        return [line.strip('\n') for line in f]
def list_to_file(l,nameFile):
    """Write the items of a list to a file, one item per line.

    l        -- list of items; a tuple is written as its first two elements
                separated by a tab, a string is written as is.  Any other
                item is skipped with a message on standard output.
    nameFile -- path of the file to create or overwrite.

    A warning is printed when l is not a list, but the write is still
    attempted.  For lists not produced by this module, the result can be
    unexpected.
    """
    # isinstance() also accepts subclasses (e.g. namedtuples), which the
    # former exact type comparison rejected.
    if not isinstance(l, list):
        print("Warning: The argument is not a list (list_to_file())")
        print("Unexpected behavior can occur")
    # The with-block closes the file even if a write fails.
    with open(nameFile, 'w') as f:
        for s in l:
            if isinstance(s, tuple):
                f.write(str(s[0]) + "\t" + str(s[1]) + '\n')
            elif isinstance(s, str):
                f.write(s + '\n')
            else:
                print("This format is not supported")
def dict_to_file(d,nameFile):
    """Write a dict to a file as one "key value" line per entry.

    d        -- dict to write; keys and values are converted with str().
    nameFile -- path of the file to create or overwrite.

    A warning is printed when d is not a dict, but the write is still
    attempted.  For dicts not produced by this module, the result can be
    unexpected.
    """
    if not isinstance(d, dict):
        print("Warning: The argument is not a dict (dict_to_file())")
        print("Unexpected behavior can occur")
    # The with-block closes the file even if a write fails.
    with open(nameFile, 'w') as f:
        for s in d:
            # str() is required: the occurrence dicts built by this module
            # hold integer counts, which cannot be concatenated to a string.
            f.write(str(s) + ' ' + str(d[s]) + '\n')
def make_diff(l1,l2):
    """Return, as a list, the distinct items of l2 that are absent from l1.

    Duplicates are dropped and the order of the result is not defined.
    """
    excluded = set(l1)
    candidates = set(l2)
    return list(candidates - excluded)
def retrieve_occ(l,occ):
    """Pair every item of l with its number of occurrences found in occ.

    Items missing from the dict occ are given a count of 0.
    Return a list of (occurrences, item) tuples, in the order of l.
    """
    return [(occ.get(item, 0), item) for item in l]
def usage():
    """Write the list of command-line options to standard error."""
    # The -d line now describes what the flag really does (it turns on the
    # debug output), and the pickle file is named "dictOcc" as it is on disk.
    sys.stderr.write("""Options available are\n
-h --help Show this help
-v --verbose Enter verbose mode
-i --input Specify an input directory
-o --output Specify an output filename ("langsTableCol" by default) [OBSOLETE, it write now into "listOcc" and "dictOcc"]
-d Enter debug mode (print the word types not computed and the fr/nl/ru counts)\n""")
def _confirm_overwrite():
    """Ask what to do with existing "dictOcc"/"listOcc" files, if any.

    Answering "y" keeps going and lets the files be overwritten, "n" exits
    with status 2, and any other answer copies each existing file to
    "<name>.old" before continuing.
    """
    if not (os.path.isfile("dictOcc") or os.path.isfile("listOcc")):
        return
    doicontinue = raw_input("Ouput file(s) already exist, overwrite ? [C/y/n] (by default copy the old file to *.old; y : to overwrite; n : to abort): ")
    answer = doicontinue.lower()
    if answer == "y":
        return
    if answer == "n":
        print("operation aborted by user")
        sys.exit(2)
    for name in ("dictOcc", "listOcc"):
        if os.path.isfile(name):
            shutil.copyfile(name, name + ".old")
    print("Old files copied to *.old")


def _write_pickle(obj, path):
    """Pickle obj into the file at path and close that file."""
    # Binary mode is what the pickle documentation asks for, and the
    # with-block guarantees that this very file gets closed.
    with open(path, 'wb') as handle:
        pickle.Pickler(handle).dump(obj)


def main(argv):
    """Parse the command line and write the "dictOcc" and "listOcc" pickles.

    argv -- list of command-line arguments, without the program name.

    Options: -h/--help, -v/--verbose, -d (debug), -i/--input DIR and
    -o/--output NAME (obsolete: accepted but ignored).  The file "out" of
    the input directory ("." by default) is scanned; the word-type counts
    per language go to "dictOcc" and the language counts to "listOcc",
    both in the current directory.

    Exits with status 2 on an illegal argument or when the user refuses
    to overwrite existing output files, and with status 0 after --help.
    """
    # Module-level flags: extract_types() reads _debug as a global.
    global _verbose
    global _debug
    _verbose = 0
    _debug = 0
    inf = '/out'
    inputdir = "."
    try:
        opts, args = getopt.getopt(argv, "hvi:o:d", ["help", "verbose", "input=", "output="])
    except getopt.GetoptError:
        sys.stderr.write("Illegal argument\n")
        usage()
        sys.exit(2)
    for opt, arg in opts:
        if opt in ("-h", "--help"):
            usage()
            sys.exit(0)
        elif opt == '-d':
            _debug = 1
            #TODO: put debug condition
        elif opt in ("-v", "--verbose"):
            _verbose = 1
            #TODO: put verbose condition
        elif opt in ("-o", "--output"):
            # Obsolete: the output names are fixed; the option is still
            # accepted so that existing command lines keep working.
            pass
        elif opt in ("-i", "--input"):
            #TODO: verify that arg is a directory path
            inputdir = arg

    # Look for old output files that may have to be backed up
    _confirm_overwrite()

    if _verbose:
        print('Extracting languages from "%s%s"...' % (inputdir, inf))
        print("It can now take several minutes...")
    llinks = extract_types(inputdir + inf)
    if _verbose:
        print("Languages extracted...")
    langsOcc = compute_occ(llinks[1])
    if _verbose:
        print("Languages computed...")
    # Create fields to avoid error when a lang missed one
    typeOcc = compute_occ_subdict(llinks[0], {'nom':0,'nom-pr':0,'adj':0,'verb':0,'adv':0,'flex':0,'loc':0, 'stub':0})
    if _verbose:
        print("Type of words computed...")
    if _debug:
        # .get() keeps the debug report from crashing when one of these
        # languages does not occur in the input.
        for lang in ('fr', 'nl', 'ru'):
            print('%s :  %s' % (lang, typeOcc.get(lang)))

    _write_pickle(typeOcc, 'dictOcc')
    _write_pickle(langsOcc, 'listOcc')
    if _verbose:
        print("Pickle files written...")
# Run the command-line entry point only when this file is executed as a
# script, passing every argument that follows the program name.
if __name__ == '__main__':
    main(sys.argv[1:])
# Timing snippet kept for manual benchmarking (disabled):
##import timeit
##t = timeit.Timer("main(sys.argv[1:])", "from __main__ import main")
##print t.repeat(3,5)