Aller au contenu

MediaWiki:Gadget-translation editor.js/Statistiques/code/partie 2

Définition, traduction, prononciation, anagramme et synonyme sur le dictionnaire libre Wiktionnaire.
#!/usr/bin/env python3
import os
import re
import time
import pywikibot
import ast
from subprocess import call

# Copie de [[MediaWiki:Gadget-translation editor.js/langues.json]] (la liste étant préfixée de languages =)
from languages_list import languages

# Wiki whose revisions are compared by count_trads_per_diff
# (constructed with code='fr', fam='wiktionary').
site = pywikibot.Site(code='fr', fam='wiktionary')

# Folder holding the previously generated results, in particular the file
# stats-trads-diffs-to-check.txt, which describes the diffs to analyse.
folder = os.path.dirname(__file__)
# Output files
file_langs = 'stats-langs-after-diffs.txt'
file_dates = 'stats-months-after-diffs.txt'
file_contributors = 'stats-trads-contributors-after-diffs.txt'
file_ips = 'stats-trads-ips-after-diffs.txt'
  
def first_n_items_dict(d, n, first_index):
  '''
  Rank the entries of d by decreasing value and return a window of that ranking.

  Entries with equal values keep the order they have in d.

  @param d: mapping whose values can be compared with each other
  @param n: maximum number of (key, value) pairs to return
  @param first_index: 0-based rank of the first pair to return
  @return: list of (key, value) tuples
  '''
  def by_value(pair):
    return pair[1]

  ranked = list(d.items())
  ranked.sort(key=by_value, reverse=True)
  last_index = first_index + n
  return ranked[first_index:last_index]
  
def dict_from_file(file, folder=folder):
  '''
  Read a UTF-8 text file containing a Python literal and return the parsed object.

  @param file: name of the file, relative to folder
  @param folder: directory containing the file (defaults to the module-level folder)
  @return: the object produced by ast.literal_eval on the whole file content
  '''
  path = os.path.join(folder, file)
  with open(path, encoding='utf-8') as source:
    raw_literal = source.read()
  return ast.literal_eval(raw_literal)
  
def generate_graph(results, file_graph='graph_template.txt', cat='langs', label='lang'):
  '''
  Fill the graph template with the given results and save it as graph-<cat>.txt.

  The placeholders __TO_REPLACE__, __LABEL__ and __OBLIQUE_LABELS__ of the
  template are substituted. If results is neither a str, a dict nor a list,
  a message is printed and nothing is written.

  @param results: an object containing the results
  @type results: str naming a file (in the module folder) that holds a Python
                literal, or dict, or list of tuples
  @param file_graph: name of the template file, relative to the module folder
  @param cat: suffix of the output file name; 'dates' also enables oblique
              labels on the x axis
  @param label: name of the field that holds each entry's key
  '''
  input_file = os.path.join(folder, file_graph)
  output_file = os.path.join(folder, 'graph-{}.txt'.format(cat))
  if isinstance(results, str):
    # The statistics files are written as UTF-8 by this script, so read them
    # back as UTF-8 instead of relying on the platform's default encoding.
    with open(os.path.join(folder, results), encoding='utf-8') as f:
      results = ast.literal_eval(f.read())
  with open(input_file, 'r', encoding='utf-8') as f:
    content = f.read()

  if isinstance(results, dict):
    pairs = results.items()
  elif isinstance(results, list):
    pairs = [(item[0], item[1]) for item in results]
  else:
    print('results objects should be either a string, a list or a dict, but it is a {}'.format(type(results)))
    return

  # One entry per line, separated by ",\n" with no trailing separator.
  row_template = '        {{"{}": "{}", "amount": {} }}'
  specific_results = ',\n'.join(
    row_template.format(label, key, str(amount)) for key, amount in pairs
  )
  content = content.replace('__TO_REPLACE__', specific_results)
  content = content.replace('__LABEL__', label)
  if cat == 'dates':
    oblique_labels = ', "properties": { "labels": {"angle": {"value": -45}, "dx": {"value": -20} } } '
  else:
    oblique_labels = ''
  content = content.replace('__OBLIQUE_LABELS__', oblique_labels)

  with open(output_file, 'w', encoding='utf-8') as f:
    f.write(content)
  print('Graph saved to {}'.format(output_file))
    
def generate_langs_graph(nbLangs=15, first_index=0, filename_var='langs'):
    '''
    Generate the graph of a slice of the per-language ranking.

    @param nbLangs: number of languages to include
    @param first_index: 0-based rank of the first language to include
    @param filename_var: suffix used for the output file name (graph-<suffix>.txt)
    '''
    counts_by_lang = dict_from_file(file_langs)
    ranking_slice = first_n_items_dict(counts_by_lang, nbLangs, first_index)
    generate_graph(
        ranking_slice,
        file_graph='graph_template.txt',
        cat=filename_var,
        label='lang',
    )
    
def generate_dates_graph():
    '''Generate the per-month graph from the file named by file_dates.'''
    graph_options = {
        'file_graph': 'graph_template.txt',
        'cat': 'dates',
        'label': 'date',
    }
    generate_graph(file_dates, **graph_options)
    
def results_to_wikitable(dict, header1='Langues', header2='Traductions ajoutées'):
  '''
  Render a mapping as the wikitext of a sortable, collapsible two-column table.

  @param dict: mapping to render, one table row per key, in iteration order
  @param header1: header of the key column
  @param header2: header of the value column
  @return: the wikitext of the table
  '''
  table_open = '{{| class="wikitable sortable mw-collapsible"\n! {} !! {}'.format(header1, header2)
  rows = ['\n|-\n| {} || {}'.format(key, str(dict[key])) for key in dict]
  table_close = '\n|}'
  return table_open + ''.join(rows) + table_close
  
def raw_results_to_wikitable(filename, dest=None, header1='Langues', header2='Traductions ajoutées'):
  '''
  Convert a results file into a wikitable file.

  Only the first line of the input file is read; it must hold the Python
  literal of a dict. Both files are located in the module-level folder.

  @param filename: name of the input file
  @param dest: name of the output file (required)
  @param header1: header of the key column
  @param header2: header of the value column
  @raise Exception: if dest is not provided
  @raise ValueError: if the first line of the input file is missing or blank
  '''
  if dest is None:
    raise Exception('A destination file must be provided')
  input_file = os.path.join(folder, filename)
  output_file = os.path.join(folder, dest)
  with open(input_file, encoding='utf-8') as f:
    first_line = f.readline()
  # Fail with an explicit message, and before the destination file is
  # truncated, when there is nothing to convert.
  if not first_line.strip():
    raise ValueError('No results found on the first line of {}'.format(input_file))
  dict_results = ast.literal_eval(first_line)
  with open(output_file, 'w', encoding='utf-8') as f:
    f.write(results_to_wikitable(dict_results, header1, header2))
  print('Wikitable saved to {}'.format(output_file))

# Matches the "<name> : <count>" lines of the plain-text statistics files.
_COUNT_LINE_RE = re.compile(r"(.+) : (\d+)")
# Cells of the HTML diff table that hold removed / added wikitext lines.
_REMOVED_LINE_RE = re.compile(r'<td class="diff-deletedline diff-side-deleted"><div>(.+)</div></td>')
_ADDED_LINE_RE = re.compile(r'<td class="diff-addedline diff-side-added"><div>(.+)</div></td>')
# Language code (first argument) of a {{trad}}, {{trad+}}, {{trad-}} or
# {{trad--}} template, tolerating the inline <ins>/<del> change markers.
_ADDED_CODE_RE = re.compile(r'{{trad(?:<ins class="diffchange diffchange-inline">)?[+-]{0,2}(?:</ins>)?\|(?:<ins class="diffchange diffchange-inline">)?([^|<]+)(?:</ins>)?\|')
_REMOVED_CODE_RE = re.compile(r'{{trad(?:<del class="diffchange diffchange-inline">)?[+-]{0,2}(?:</del>)?\|(?:<del class="diffchange diffchange-inline">)?([^|<]+)(?:</del>)?\|')


def _load_counts(filename):
  '''Read a "<name> : <count>" file from the module folder into a dict, skipping other lines.'''
  counts = {}
  with open(os.path.join(folder, filename), encoding="utf-8") as f:
    for line in f:
      match = _COUNT_LINE_RE.search(line)
      if match is None:
        continue
      counts[match[1]] = int(match[2])
  return counts


def _parse_diff_line(line):
  '''
  Split one line of the diffs file into (from_rev, to_rev, contrib, is_ip, date).

  Expected format:
  title=lire;prev_rev_id=18770633;rev_id=18908602;contrib=Test;is_ip=false;date=2015-01
  '''
  fields = line.split(';')
  from_rev = int(fields[1].split('=')[1])
  to_rev = int(fields[2].split('=')[1])
  contrib = fields[3].split('=')[1]
  is_ip = fields[4].split('=')[1] == 'true'
  date = fields[5].split('=')[1].strip()
  return from_rev, to_rev, contrib, is_ip, date


def _added_language_codes(diff_html):
  '''Return the language codes of the translations that the diff really adds.'''
  # The Mediawiki diff algorithm sometimes shows existing translations as removed lines
  # (cf. eg. https://fr.wiktionary.org/w/index.php?title=boto&diff=prev&oldid=25504237),
  # and then shows them back in the added lines.
  # Hence we collect all language codes in the added translations (A), and in the removed translations (B),
  # and we do the diff, by subtracting B from A (as multisets).
  # This works as translation_editor is only used to add translations.
  # Note: the previously used algorithm ([[Special:Permalink/33473930]]) was almost twice as efficient,
  # and had an error margin of just about ~100 entries out of 77,500, that is a ~0.13 % error margin.
  codes_added = []
  for cell in _ADDED_LINE_RE.findall(diff_html):
    codes_added += _ADDED_CODE_RE.findall(cell)
  for cell in _REMOVED_LINE_RE.findall(diff_html):
    for code in _REMOVED_CODE_RE.findall(cell):
      # Remove one instance per removed code. A removed code without any added
      # counterpart is ignored instead of aborting the whole run.
      if code in codes_added:
        codes_added.remove(code)
  return codes_added


def count_trads_per_diff():
  '''
  Generate statistics about added translations when the edit summaries are
  truncated and the diffs therefore have to be analysed.

  Starts from the counts found in stats-langs.txt, stats-months.txt,
  stats-trads-contributors.txt and stats-trads-ips.txt, fetches every diff
  listed in stats-trads-diffs-to-check.txt, adds the translations found in
  them, and writes the updated counts to the *-after-diffs.txt files.
  All files are located in the module-level folder.
  '''
  stats_langs = _load_counts("stats-langs.txt")
  stats_dates = _load_counts("stats-months.txt")
  stats_contribs = _load_counts("stats-trads-contributors.txt")
  stats_ips = _load_counts("stats-trads-ips.txt")

  cpt = 0
  cpt_trads = 0
  count_contributors = 0
  count_ips = 0

  with open(os.path.join(folder, "stats-trads-diffs-to-check.txt"), encoding="utf-8") as f:
    for line in f:
      if not line.strip():
        # Tolerate blank lines (e.g. at the end of the file).
        continue
      from_rev, to_rev, contrib, is_ip, date = _parse_diff_line(line)
      diff_html = site.compare(old=from_rev, diff=to_rev)

      for code in _added_language_codes(diff_html):
        if code in languages:
          lang_name = languages[code]
        elif code in languages['redirects']:
          lang_name = languages[languages['redirects'][code]]
        else:
          # Language codes missing from the list of languages are ignored.
          # 1 case on 2023-12-03: the code zh-tc, removed since then.
          print('CODE ' + code + ' NOT FOUND (and translation ignored) - ' + line.strip())
          continue
        if lang_name:
          stats_langs[lang_name] = stats_langs.get(lang_name, 0) + 1
        if is_ip:
          stats_ips[contrib] = stats_ips.get(contrib, 0) + 1
          count_ips += 1
        else:
          stats_contribs[contrib] = stats_contribs.get(contrib, 0) + 1
          count_contributors += 1
        stats_dates[date] = stats_dates.get(date, 0) + 1
        cpt_trads += 1
      cpt += 1
      if cpt % 100 == 0:
        print(str(cpt) + " diffs traites (" + str(cpt_trads) + " traductions)")

  with open(os.path.join(folder, "stats-trads-res-after-diffs.txt"), "w", encoding="utf-8") as f:
    res = "Résultats des stats sur les résumés d'édition tronqués (ajout de traductions par lots) :\n"
    res += "Traductions ajoutées : " + str(cpt_trads) + "\n"
    res += "Traductions ajoutées par des utilisateurs inscrits : " + str(count_contributors) + "\n"
    res += "Traductions ajoutées par des utilisateurs non inscrits : " + str(count_ips) + "\n"
    f.write(res)

  # The module-level file_* constants name the same files as before; using them
  # keeps the writer and the readers of these files in sync.
  outputs = (
    (file_langs, stats_langs),
    (file_dates, stats_dates),
    (file_contributors, stats_contribs),
    (file_ips, stats_ips),
  )
  for output_name, stats in outputs:
    with open(os.path.join(folder, output_name), "w", encoding="utf-8") as f:
      f.write(str(stats))
    
if __name__ == '__main__':
    started_at = time.time()
    count_trads_per_diff()
    elapsed = time.time() - started_at
    print("--- %s seconds ---" % elapsed)

    # Build one wikitable file per statistics file:
    # (input file, output file, header of the first column).
    wikitable_jobs = (
        (file_dates, 'wikitable-dates.txt', 'Mois'),
        (file_langs, 'wikitable-langs.txt', 'Langue'),
        (file_ips, 'wikitable-ips.txt', 'Utilisateur non enregistré'),
        (file_contributors, 'wikitable-contributors.txt', 'Utilisateur enregistré'),
    )
    for stats_file, wikitable_file, first_header in wikitable_jobs:
        raw_results_to_wikitable(
            stats_file,
            dest=wikitable_file,
            header1=first_header,
            header2='Traductions ajoutées',
        )

    # Graph generation is disabled: the Graph extension has been turned off on
    # Wikimedia wikis since 1/2024 because of security issues.
    # generate_langs_graph()
    # generate_langs_graph(15, 15, filename_var='langs2')
    # generate_dates_graph()

Ce script a pris environ 20 minutes à s’exécuter (Windows 10, processeur quadricœur à 2,3 GHz, 8 Go de RAM).

Il fait référence au fichier graph_template.txt suivant pour la génération des graphiques :

{{#tag:graph|
{
  "version": 4,
  "width": 1000,
  "height": 200,
  "padding": {"top": 20, "left": 65, "bottom": 60, "right": 10},
 
  "data": [
    {
      "name": "table",
      "values": [
__TO_REPLACE__
      ]
    }
  ],
 
  "signals": [
    {
      "name": "tooltip",
      "init": {},
      "streams": [
        {"type": "rect:mouseover", "expr": "datum"},
        {"type": "rect:mouseout", "expr": "{}"}
      ]
    }
  ],
 
  "predicates": [
    {
      "name": "tooltip", "type": "==",
      "operands": [{"signal": "tooltip._id"}, {"arg": "id"}]
    }
  ],
 
  "scales": [
    { "name": "xscale", "type": "ordinal", "range": "width",
      "domain": {"data": "table", "field": "__LABEL__"} },
    { "name": "yscale", "type": "linear", "range": "height",
      "domain": {"data": "table", "field": "amount"} }
  ],
 
  "axes": [
    { "type": "x", "scale": "xscale"__OBLIQUE_LABELS__},
    { "type": "y", "scale": "yscale" }
  ],
 
  "marks": [
    {
      "type": "rect",
      "from": {"data":"table"},
      "properties": {
        "enter": {
          "x": {"scale": "xscale", "field": "__LABEL__"},
          "width": {"scale": "xscale", "band": true, "offset": -1},
          "y": {"scale": "yscale", "field": "amount"},
          "y2": {"field": {"group": "height"} }
        },
        "update": { "fill": {"value": "steelblue"} },
        "hover": { "fill": {"value": "red"} }
      }
    },
    {
      "type": "text",
      "properties": {
        "enter": {
          "align": {"value": "center"},
          "fill": {"value": "#333"}
        },
        "update": {
          "x": {"scale": "xscale", "signal": "tooltip.__LABEL__"},
          "dx": {"scale": "xscale", "band": true, "mult": 0.5},
          "y": {"scale": "yscale", "signal": "tooltip.amount", "offset": -5},
          "text": {"signal": "tooltip.amount"},
          "fillOpacity": {
            "rule": [
              {
                "predicate": {"name": "tooltip", "id": {"value": null} },
                "value": 0
              },
              {"value": 1}
            ]
          }
        }
      }
    }
  ]
}
| mode=interactive }}