Utilisateur:Kipmaster/replace-test
Attached is the super awesome simulation script, described on my user page...
# -*- coding: utf-8 -*-
"""
This bot will make direct text replacements. It will retrieve information on
which pages might need changes either from an SQL dump or a text file, or only
change a single page.
You can run the bot with the following commandline parameters:
-sql - Retrieve information from a local SQL dump (cur table, see
http://download.wikimedia.org).
Argument can also be given as "-sql:filename".
-file - Retrieve information from a local text file.
Will read any [[wiki link]] and use these articles.
Argument can also be given as "-file:filename".
-page - Only edit a single page.
Argument can also be given as "-page:pagename". You can give this
parameter multiple times to edit multiple pages.
-regex - Make replacements using regular expressions. If this argument
isn't given, the bot will make simple text replacements.
-except:XYZ - Ignore pages which contain XYZ. If the -regex argument is given,
XYZ will be regarded as a regular expression.
-fix:XYZ - Perform one of the predefined replacement tasks, which are given
in the dictionary 'fixes' defined inside this file.
The -regex argument and given replacements will be ignored if
you use -fix.
Currently available predefined fixes are:
* HTML - convert HTML tags to wiki syntax, and fix XHTML
-namespace:n - Namespace to process. Works only with an SQL dump
-always - Don't prompt you for each replacement
other: - First argument is the old text, second argument is the new text.
If the -regex argument is given, the first argument will be
regarded as a regular expression, and the second argument might
contain expressions like \\1 or \g<name>.
NOTE: Only use either -sql or -file or -page, but don't mix them.
Examples:
If you want to change templates from the old syntax, e.g. {{msg:Stub}}, to the
new syntax, e.g. {{Stub}}, download an SQL dump file (cur table) from
http://download.wikimedia.org, then use this command:
python replace.py -sql -regex "{{msg:(.*?)}}" "{{\\1}}"
If you have a dump called foobar.sql and want to fix typos, e.g.
Errror -> Error, use this:
python replace.py -sql:foobar.sql "Errror" "Error"
If you have a page called 'John Doe' and want to convert HTML tags to wiki
syntax, use:
python replace.py -page:John_Doe -fix:HTML
"""
#
# (C) Daniel Herding, 2004
#
# Distributed under the terms of the PSF license.
#
from __future__ import generators
import sys, re
import wikipedia, config
# Summary messages in different languages
# NOTE: Predefined replacement tasks might use their own dictionary, see 'fixes'
# below.
msg = {
'de':u'Bot: Automatisierte Textersetzung %s',
'en':u'Robot: Automated text replacement %s',
'es':u'Robot: Reemplazo automático de texto %s',
'fr':u'Bot : Remplacement de texte automatisé %s',
'hu':u'Robot: Automatikus szövegcsere %s',
'is':u'Vélmenni: breyti texta %s',
'pt':u'Bot: Mudança automática %s',
}
# Predefined replacement tasks.
fixes = {
'cat': {
'regex': True,
'msg': {
'de':u'Bot: entferne Grundkategorien'
},
'replacements': {
r'\[\[Kategorie:Zeit:.+?\]\][\r\n ]*': '',
r'\[\[Kategorie:Raum:.+?\]\][\r\n ]*': '',
r'\[\[Kategorie:Thema:.+?\]\][\r\n ]*': '',
r'\[\[Kategorie:Typ:.+?\]\][\r\n ]*': '',
}
},
# These replacements will convert HTML to wiki syntax where possible, and
# make remaining tags XHTML compliant.
'HTML': {
'regex': True,
# We don't want to mess up pages which discuss HTML tags, so we skip
# all pages which contain nowiki tags.
'exceptions': ['<nowiki>'],
'msg': {
'en':u'Robot: converting/fixing HTML',
'de':u'Bot: konvertiere/korrigiere HTML',
},
'replacements': {
# Everything case-insensitive (?i)
# Keep in mind that MediaWiki automatically converts <br> to <br />
# when rendering pages, so you might comment the next two lines out
# to save some time/edits.
#r'(?i)<br>': r'<br />',
# linebreak with attributes
#r'(?i)<br ([^>/]+?)>': r'<br \1 />',
r'(?i)<b>(.*?)</b>': r"'''\1'''",
r'(?i)<strong>(.*?)</strong>': r"'''\1'''",
r'(?i)<i>(.*?)</i>': r"''\1''",
r'(?i)<em>(.*?)</em>': r"''\1''",
# horizontal line without attributes in a single line
r'(?i)([\r\n])<hr[ /]*>([\r\n])': r'\1----\2',
# horizontal line without attributes with more text in the same line
r'(?i) +<hr[ /]*> +': r'\r\n----\r\n',
# horizontal line with attributes; can't be done with wiki syntax
# so we only make it XHTML compliant
r'(?i)<hr ([^>/]+?)>': r'<hr \1 />',
# a header where only spaces are in the same line
r'(?i)([\r\n]) *<h1> *([^<]+?) *</h1> *([\r\n])': r"\1= \2 =\3",
r'(?i)([\r\n]) *<h2> *([^<]+?) *</h2> *([\r\n])': r"\1== \2 ==\3",
r'(?i)([\r\n]) *<h3> *([^<]+?) *</h3> *([\r\n])': r"\1=== \2 ===\3",
r'(?i)([\r\n]) *<h4> *([^<]+?) *</h4> *([\r\n])': r"\1==== \2 ====\3",
r'(?i)([\r\n]) *<h5> *([^<]+?) *</h5> *([\r\n])': r"\1===== \2 =====\3",
r'(?i)([\r\n]) *<h6> *([^<]+?) *</h6> *([\r\n])': r"\1====== \2 ======\3",
# TODO: maybe we can make the bot replace <p> tags with \r\n's.
}
},
# Grammar fixes for German language
'grammar-de': {
'regex': True,
'exceptions': ['sic!'],
'msg': {
'de':u'Bot: korrigiere Grammatik',
},
'replacements': {
u'([Ss]owohl) ([^,\.]+?), als auch': r'\1 \2 als auch',
#u'([Ww]eder) ([^,\.]+?), noch': r'\1 \2 noch',
            # (?<=\W) lookbehind: the old (?=\W) lookahead could never match,
            # because \d is itself a word character
            u'(?<=\W)(\d[\d\.\,]*\d|\d)($|€|DM|mg|g|kg|l|t|ms|s|min|h|µm|mm|cm|dm|m|km|°C|K|kB|MB|TB)(?=\W)': r'\1 \2',
            u'(\d+)\.(Januar|Februar|März|April|Mai|Juni|Juli|August|September|Oktober|November|Dezember)': r'\1. \2',
}
},
}
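# Illustrative sketch (not one of the shipped fixes): a new predefined task
# could be added to the dictionary above like this. The 'typo-en' key and its
# entries are made up for illustration; uncomment and adapt them, then run
# the bot with -fix:typo-en.
#fixes['typo-en'] = {
#    'regex': False,
#    'msg': {
#        'en': u'Robot: fixing common typos',
#    },
#    'replacements': {
#        u'recieve': u'receive',
#        u'occured': u'occurred',
#    },
#}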
class ReplacePageGenerator:
"""
Generator which will yield PageLinks for pages that might contain text to
replace. These pages might be retrieved from a local SQL dump file or a
text file, or as a list of pages entered by the user.
Arguments:
* source - Where the bot should retrieve the page list from.
Can be 'sqldump', 'textfile' or 'userinput'.
* replacements - A dictionary where keys are original texts and values
are replacement texts.
* exceptions - A list of strings; pages which contain one of these
won't be changed.
* regex - If the entries of replacements and exceptions should
be interpreted as regular expressions
* namespace - Namespace to process in case of a SQL dump. -1 means
that all namespaces should be searched.
* textfilename - The textfile's path, either absolute or relative, which
will be used when source is 'textfile'.
* sqlfilename - The dump's path, either absolute or relative, which
will be used when source is 'sqldump'.
* pagenames - a list of pages which will be used when source is
'userinput'.
"""
def __init__(self, source, replacements, exceptions, regex = False, namespace = -1, textfilename = None, sqlfilename = None, pagenames = None):
self.source = source
self.replacements = replacements
self.exceptions = exceptions
self.regex = regex
self.namespace = namespace
self.textfilename = textfilename
self.sqlfilename = sqlfilename
self.pagenames = pagenames
def read_pages_from_sql_dump(self):
"""
Generator which will yield PageLinks to pages that might contain text to
replace. These pages will be retrieved from a local sql dump file
(cur table).
Arguments:
* sqlfilename - the dump's path, either absolute or relative
* replacements - a dictionary where old texts are keys and new texts
are values
* exceptions - a list of strings; pages which contain one of these
won't be changed.
* regex - if the entries of replacements and exceptions should
be interpreted as regular expressions
"""
#mysite = wikipedia.getSite()
import sqldump
dump = sqldump.SQLdump(self.sqlfilename, wikipedia.myencoding())
for entry in dump.entries():
skip_page = False
if self.namespace != -1 and self.namespace != entry.namespace:
continue
            if "Wiktionary:" in entry.full_title():
                continue
            for exception in self.exceptions:
                if self.regex:
                    exception = re.compile(exception)
                    if exception.search(entry.text):
                        skip_page = True
                        break
                else:
                    if exception in entry.text:
                        skip_page = True
                        break
if not skip_page:
for old in self.replacements:
if self.regex:
old = re.compile(old)
if old.search(entry.text):
yield entry
# yield wikipedia.PageLink(mysite, entry.full_title())
break
else:
if old in entry.text:
# yield wikipedia.PageLink(mysite, entry.full_title())
yield entry
break
def read_pages_from_text_file(self):
"""
Generator which will yield pages that are listed in a text file created by
the bot operator. Will regard everything inside [[double brackets]] as a
page name, and yield PageLinks for these pages.
Arguments:
* textfilename - the textfile's path, either absolute or relative
"""
f = open(self.textfilename, 'r')
# regular expression which will find [[wiki links]]
        R = re.compile(r'\[\[([^\]]*)\]\]')
        for line in f:
            # findall() returns every [[link]] on the line, which fixes the
            # old bug where only one link per line was found
            for title in R.findall(line):
                yield wikipedia.PageLink(wikipedia.getSite(), title)
f.close()
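    # Example input file for -file (hypothetical content); each [[bracketed]]
    # title found in the file yields one page to process:
    #
    #   Pages to check:
    #   * [[John Doe]]
    #   * [[Foo]], and also [[Bar]]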
def read_pages_from_wiki_page(self):
'''
Generator which will yield pages that are listed in a wiki page. Will
regard everything inside [[double brackets]] as a page name, except for
interwiki and category links, and yield PageLinks for these pages.
Arguments:
* pagetitle - the title of a page on the home wiki
'''
listpage = wikipedia.PageLink(wikipedia.getSite(), self.pagetitle)
list = wikipedia.get(listpage, read_only = True)
# TODO - UNFINISHED
# TODO: Make MediaWiki's search feature available.
def generate(self):
'''
Starts the generator.
'''
if self.source == 'sqldump':
for pl in self.read_pages_from_sql_dump():
yield pl
elif self.source == 'textfile':
for pl in self.read_pages_from_text_file():
yield pl
elif self.source == 'userinput':
for pagename in self.pagenames:
yield wikipedia.PageLink(wikipedia.getSite(), pagename)
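# Usage sketch (commented out): driving the generator directly, as main()
# does below. The page name is made up for illustration.
#gen = ReplacePageGenerator('userinput', {u'Errror': u'Error'}, [],
#                           regex = False, pagenames = [u'John Doe'])
#for pl in gen.generate():
#    wikipedia.output(pl.linkname())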
class ReplaceRobot:
def __init__(self, generator, replacements, exceptions = [], regex = False, acceptall = False):
self.generator = generator
self.replacements = replacements
self.exceptions = exceptions
self.regex = regex
self.acceptall = acceptall
def checkExceptions(self, original_text):
"""
        If one of the exceptions applies to the given text, returns the
        substring which matches the exception. Otherwise it returns None.
"""
for exception in self.exceptions:
if self.regex:
exception = re.compile(exception)
hit = exception.search(original_text)
if hit:
return hit.group(0)
else:
hit = original_text.find(exception)
if hit != -1:
return original_text[hit:hit + len(exception)]
return None
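        # Example: with exceptions = ['sic!'] and regex = False,
        # checkExceptions(u'foo sic! bar') returns u'sic!', while
        # checkExceptions(u'foo bar') returns None.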
def doReplacements(self, original_text):
"""
Returns the text which is generated by applying all replacements to the
given text.
"""
new_text = original_text
for old, new in self.replacements.iteritems():
if self.regex:
# TODO: compiling the regex each time might be inefficient
oldR = re.compile(old)
new_text = oldR.sub(new, new_text)
else:
new_text = new_text.replace(old, new)
return new_text
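        # Example: with replacements = {u'Errror': u'Error'} and regex = False,
        # doReplacements(u'An Errror occurred') returns u'An Error occurred'.
        # With regex = True, the keys are compiled as patterns and the values
        # may use backreferences such as \1 or \g<name>.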
def run(self):
"""
Starts the robot.
"""
# Run the generator which will yield PageLinks to pages which might need to be
# changed.
for pl in self.generator.generate():
            print ''
            # addition by me (Kipmaster): show which page is being processed
            wikipedia.output(u'Processing page %s' % pl.full_title())
try:
# Load the page's text from the wiki
# original_text = pl.get()
original_text = pl.text
except wikipedia.NoPage:
# wikipedia.output(u'Page %s not found' % pl.linkname())
wikipedia.output(u'Page %s not found' % pl.full_title())
continue
except wikipedia.LockedPage:
# wikipedia.output(u'Skipping locked page %s' % pl.linkname())
wikipedia.output(u'Skipping locked page %s' % pl.full_title())
continue
except wikipedia.IsRedirectPage:
continue
match = self.checkExceptions(original_text)
# skip all pages that contain certain texts
if match:
wikipedia.output(u'Skipping %s because it contains %s' % (pl.linkname(), match))
else:
new_text = self.doReplacements(original_text)
if new_text == original_text:
# wikipedia.output('No changes were necessary in %s' % pl.linkname())
wikipedia.output('No changes were necessary in %s' % pl.full_title())
else:
wikipedia.showColorDiff(original_text, new_text)
if not self.acceptall:
choice = wikipedia.input(u'Do you want to accept these changes? [y|n|a(ll)]')
if choice in ['a', 'A']:
self.acceptall = True
if self.acceptall or choice in ['y', 'Y']:
                        wikipedia.output(u'ok (simulation: nothing is actually saved)')
# pl.put(new_text)
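                        # Simulation mode: the pl.put(new_text) call above is
                        # commented out, so no edit is ever saved to the wiki.
                        # Re-enable it to make the bot apply changes for real.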
def main():
# How we want to retrieve information on which pages need to be changed.
# Can either be 'sqldump', 'textfile' or 'userinput'.
source = None
# Array which will collect commandline parameters.
# First element is original text, second element is replacement text.
commandline_replacements = []
# A dictionary where keys are original texts and values are replacement texts.
replacements = {}
# Don't edit pages which contain certain texts.
exceptions = []
# Should the elements of 'replacements' and 'exceptions' be interpreted
# as regular expressions?
regex = False
# Predefined fixes from dictionary 'fixes' (see above).
fix = None
# the dump's path, either absolute or relative, which will be used when source
# is 'sqldump'.
sqlfilename = ''
# the textfile's path, either absolute or relative, which will be used when
# source is 'textfile'.
textfilename = ''
# a list of pages which will be used when source is 'userinput'.
pagenames = []
    # will become True when the user presses 'a' ('yes to all') or uses the
    # -always commandline parameter.
acceptall = False
# Which namespace should be processed when using a SQL dump
# default to -1 which means all namespaces will be processed
namespace = -1
# Load default summary message.
wikipedia.setAction(wikipedia.translate(wikipedia.getSite(), msg))
# Read commandline parameters.
for arg in sys.argv[1:]:
arg = wikipedia.argHandler(arg, logname = 'replace.log')
if arg:
if arg == '-regex':
regex = True
elif arg.startswith('-file'):
if len(arg) == 5:
textfilename = wikipedia.input(u'Please enter the filename:')
else:
textfilename = arg[6:]
source = 'textfile'
elif arg.startswith('-sql'):
if len(arg) == 4:
sqlfilename = wikipedia.input(u'Please enter the SQL dump\'s filename:')
else:
sqlfilename = arg[5:]
source = 'sqldump'
elif arg.startswith('-page'):
if len(arg) == 5:
                    pagenames.append(wikipedia.input(u'Which page do you want to change?'))
else:
pagenames.append(arg[6:])
source = 'userinput'
elif arg.startswith('-except:'):
exceptions.append(arg[8:])
elif arg.startswith('-fix:'):
fix = arg[5:]
elif arg == '-always':
acceptall = True
elif arg.startswith('-namespace:'):
namespace = int(arg[11:])
else:
commandline_replacements.append(arg)
if source == None or len(commandline_replacements) not in [0, 2]:
# syntax error, show help text from the top of this file
wikipedia.output(__doc__, 'utf-8')
wikipedia.stopme()
sys.exit()
if (len(commandline_replacements) == 2 and fix == None):
replacements[commandline_replacements[0]] = commandline_replacements[1]
        wikipedia.setAction(wikipedia.translate(wikipedia.getSite(), msg) % (' (-' + commandline_replacements[0] + ' +' + commandline_replacements[1] + ')'))
elif fix == None:
old = wikipedia.input(u'Please enter the text that should be replaced:')
new = wikipedia.input(u'Please enter the new text:')
change = '(-' + old + ' +' + new
replacements[old] = new
while True:
old = wikipedia.input(u'Please enter another text that should be replaced, or press Enter to start:')
if old == '':
change = change + ')'
break
new = wikipedia.input(u'Please enter the new text:')
change = change + ' & -' + old + ' +' + new
replacements[old] = new
default_summary_message = wikipedia.translate(wikipedia.getSite(), msg) % change
wikipedia.output(u'The summary message will default to: %s' % default_summary_message)
summary_message = wikipedia.input(u'Press Enter to use this default message, or enter a description of the changes your bot will make:')
if summary_message == '':
summary_message = default_summary_message
wikipedia.setAction(summary_message)
else:
# Perform one of the predefined actions.
try:
fix = fixes[fix]
except KeyError:
wikipedia.output(u'Available predefined fixes are: %s' % fixes.keys())
wikipedia.stopme()
sys.exit()
if 'regex' in fix:
regex = fix['regex']
if 'msg' in fix:
wikipedia.setAction(wikipedia.translate(wikipedia.getSite(), fix['msg']))
if 'exceptions' in fix:
exceptions = fix['exceptions']
replacements = fix['replacements']
gen = ReplacePageGenerator(source, replacements, exceptions, regex, namespace, textfilename, sqlfilename, pagenames)
bot = ReplaceRobot(gen, replacements, exceptions, regex, acceptall)
bot.run()
if __name__ == "__main__":
try:
main()
except:
wikipedia.stopme()
raise
else:
wikipedia.stopme()