Utilisateur:Jeannot12/maj hanja.pl
Apparence
#!/usr/bin/perl
#
# maj hanja.pl
#
# Rewrites the {{=ko-hanja=}} section of locally stored Wiktionary articles.
#
# For every character named in the character list the script:
#   1. reads the article "$srcdir/<character>" and splits it into the text
#      before the hanja section, the {{=ja=}} part, the {{-voir-}} part, the
#      values found inside the hanja section, and the text after it;
#   2. replaces the eumhun value with the one from the eumhun table (when the
#      table has an entry) and derives the Hangul value from it;
#   3. writes the reassembled article to "$dstdir/<character>" when a Hangul
#      value is available;
#   4. sleeps $delay seconds before handling the next character.
#
# Progress and diagnostics are printed on STDERR.

use strict;
use warnings;
use utf8;
use open ':utf8';

binmode(STDIN,  ":utf8");
binmode(STDOUT, ":utf8");
binmode(STDERR, ":utf8");

# Pause between two articles, in seconds.
my $delay = 30;

# Directory the articles are read from.
my $srcdir = "/home/jean/wfs/wiktionary-fr";

# Directory the rewritten articles are written to.  The original script also
# carried a commented-out alternative: "/home/jean/new".
my $dstdir = "/home/jean/wfs/wiktionary-fr";

# One character per line: the articles to process.
my $char_list_file = "/home/jean/db/jeudessai";

# "key=value" lines mapping a character to its eumhun.
my $eumhun_file = "/home/jean/db/nabi-hanja.txt";

# Template names whose values are collected from the hanja section.
my @listitem = ('Hangul', 'roman-ko', 'McCune', 'eumhun', 'Yale-ko');

# Lines of the hanja section that carry no value and are silently dropped
# (they are regenerated when the section is written back).
my $ignore = '(^$|-pron|ébauche|\{\{API\}\}|\{\{SAMPA\}\})|Romanisation :|\{\{-réf-\}\}|\{\{R:libhangul\}\}';
my $ignore_re = qr/$ignore/;

# Reads the list of characters to process, one per line.
# Dies when the list cannot be opened.  Empty lines are skipped so that they
# do not turn into an attempt to process the article directory itself.
sub read_character_list {
    my ($path) = @_;

    open(my $in, '<:utf8', $path)
        or die "cannot open hanja list $path ($!)\n";

    my @chars;
    while (my $line = <$in>) {
        chomp $line;
        next if $line eq '';
        push @chars, $line;
    }
    close($in);

    return @chars;
}

# Replaces the Korean annotations describing a variant character with their
# French wording and returns the resulting string.
sub translate_gloss {
    my ($v) = @_;

    $v =~ s/(.)의 譌字/(forme incorrecte de [[$1]])/;

    # 簡體字 denotes a simplified form; the original emitted the 譌字 label
    # here, apparently a copy-paste slip.
    $v =~ s/(.)의 簡體字/(forme simplifiée de [[$1]])/;

    # The two-character rule must run before the one-character rule:
    # otherwise the latter consumes the "同字" suffix and the former can
    # never match.
    $v =~ s/(.)·(.)(?:과|와) 同字/(forme alternative de [[$1]] et [[$2]])/;
    $v =~ s/(.)(?:과|와) 同字/(forme alternative de [[$1]])/;

    $v =~ s/(.)의 古字/(forme archaïque de [[$1]])/;
    $v =~ s/(.)의 本字/(forme originale de [[$1]])/;
    $v =~ s/(.)의 俗字/(forme familière de [[$1]])/;
    $v =~ s/(.)의 訛字/(forme incorrecte pour [[$1]])/;
    $v =~ s/(.)의 正字/(forme correcte de [[$1]])/;
    $v =~ s/(.)의 갖은자/(variante sémantique de [[$1]])/;
    $v =~ s/(.)의 略字/(forme abregée de [[$1]])/;

    return $v;
}

# Loads the eumhun table and returns a hash reference: character => eumhun.
# Lines starting with "[" are skipped; several entries for the same
# character are joined with ", ".  Dies when the table cannot be opened.
sub load_eumhun_table {
    my ($path) = @_;

    open(my $in, '<:utf8', $path)
        or die "cannot open eumhun table $path ($!)\n";

    my %table;
    while (my $line = <$in>) {
        next if $line =~ /^\[/;
        chomp $line;

        my ($k, $v) = split(/=/, $line);
        next unless defined $v && $v =~ /^./;

        $v = translate_gloss($v);
        if ($table{$k}) {
            $table{$k} .= ", " . $v;
        }
        else {
            $table{$k} = $v;
        }
    }
    close($in);

    return \%table;
}

# Derives the Hangul value from an eumhun string: for each comma-separated
# reading, the last non-blank character is kept when it lies in the Hangul
# syllable range U+AC00..U+D7A3.  Syllables are returned joined by ", ", in
# order of first appearance and without duplicates.
sub hangul_of_eumhun {
    my ($eumhun) = @_;

    my %seen;
    my @syllables;
    for my $reading (split(/,/, $eumhun)) {
        next unless $reading =~ /(\S)\s*$/;
        my $last = $1;
        my $code = ord($last);
        next if $code < 0xac00 || $code > 0xd7a3;
        push @syllables, $last unless $seen{$last}++;
    }

    return join(", ", @syllables);
}

# Reads the article at $path and splits it.  Returns a hash reference with:
#   avant - text before any recognised section
#   ja    - the {{=ja=}} line and what follows it
#   voir  - the {{-voir-}} line and what follows it
#   apres - categories, interwiki links and whatever follows the hanja section
#   val   - hash reference: item name => value found in the hanja section
# A file that cannot be opened is reported on STDERR and treated as empty.
sub parse_article {
    my ($path, $car) = @_;

    my %article = (
        avant => '',
        ja    => '',
        voir  => '',
        apres => '',
        val   => {},
    );

    my $in;
    unless (open($in, '<:utf8', $path)) {
        print STDERR "cannot open article $path ($!)\n";
        return \%article;
    }

    # Name of the part the current line belongs to; 'ko' is the hanja
    # section, whose lines are parsed instead of being copied.
    my $section = 'avant';

    LINE:
    while (my $line = <$in>) {
        if ($line =~ /\[\[Catégorie/ || $line =~ /\[\[..:\Q$car\E/) {
            print STDERR "etat=apres\n";
            $section = 'apres';
            $article{apres} .= $line;
        }
        elsif ($line =~ /\{\{-voir-\}\}/) {
            print STDERR "etat=voir\n";
            $section = 'voir';
            $article{voir} .= $line;
        }
        elsif ($line =~ /\{\{=ja=\}\}/) {
            print STDERR "etat=ja\n";
            $section = 'ja';
            $article{ja} .= $line;
        }
        elsif ($line =~ /\{\{=ko-hanja=\}\}/) {
            # The section header itself is dropped; it is regenerated on output.
            print STDERR "etat=ko\n";
            $section = 'ko';
        }
        elsif ($section eq 'ko') {
            # Another section header or an interwiki link ends the hanja section.
            if ($line =~ /\{\{=/ || $line =~ /\[\[en/ || $line =~ /\{\{-voir/) {
                print STDERR "etat=apres (ko)\n";
                $section = 'apres';
                $article{apres} .= $line;
                next LINE;
            }

            chomp $line;
            next LINE if $line =~ $ignore_re;

            my $found = 0;
            for my $item (@listitem) {
                if ($line =~ /\{\{\Q$item\E\}\}\s*:\s*(\S*.*)/) {
                    $article{val}{$item} = $1;
                    $found = 1;
                }
            }
            print STDERR "DEBUG : $line : no item found\n" unless $found;
        }
        else {
            $article{$section} .= $line;
        }
    }
    close($in);

    return \%article;
}

# Writes the reassembled article to $path, with the hanja section rebuilt
# from the collected values.  Returns 1 on success; a failure to open or
# close the file is reported on STDERR and 0 is returned.
#
# NOTE(review): the line breaks inside the generated section were lost in the
# copy of the script this was rebuilt from; one newline per element is
# assumed here - confirm against an article produced by the original.
sub write_article {
    my ($path, $article) = @_;

    # Missing items are written as empty values.
    my %v = map {
        $_ => (defined $article->{val}{$_} ? $article->{val}{$_} : '')
    } @listitem;

    my $bloc = join("\n",
        '',
        '{{=ko-hanja=}}',
        '{{ébauche|ko}}',
        '* {{Hangul}} : ' . $v{'Hangul'},
        '* {{eumhun}} : ' . $v{'eumhun'},
        '* Romanisation :',
        '** {{roman-ko}} : ' . $v{'roman-ko'},
        '** {{McCune}} : ' . $v{'McCune'},
        '** {{Yale-ko}} : ' . $v{'Yale-ko'},
        '{{-réf-}}',
        '{{R:libhangul}}',
        '',
    );

    my $out;
    unless (open($out, '>:utf8', $path)) {
        print STDERR "cannot write article $path ($!)\n";
        return 0;
    }

    # The sections are reordered: text before, Japanese, "voir", hanja, rest.
    print {$out} $article->{avant}, $article->{ja}, $article->{voir},
        $bloc,
        $article->{apres},
        "\n[[Summary: reformatage section hanja, ajout eumhun, replacement -voir-]]\n";

    unless (close($out)) {
        print STDERR "cannot finish writing article $path ($!)\n";
        return 0;
    }

    return 1;
}

my @caracteres = read_character_list($char_list_file);
my $hanja      = load_eumhun_table($eumhun_file);

foreach my $car (@caracteres) {
    print STDERR "DEBUG : traitement de $car...\n";

    my $article = parse_article("$srcdir/$car", $car);
    my $val     = $article->{val};

    if ($hanja->{$car}) {
        # The table takes precedence over whatever the article contained,
        # and the Hangul value is always re-derived from it.
        $val->{'eumhun'} = $hanja->{$car};
        $val->{'Hangul'} = hangul_of_eumhun($val->{'eumhun'});
    }
    else {
        print STDERR "pas de eumhun pour $car\n";
    }

    # Without a Hangul value the article is left alone.
    if ($val->{'Hangul'}) {
        write_article("$dstdir/$car", $article);
    }
    else {
        print STDERR "Pas de modification pour $car\n";
    }

    print STDERR "attente de $delay secondes...\n";
    sleep($delay);
}