# Forked from unfoldingWord/en_tw (Perl, 107 lines, 3.1 KiB).
use 5.12.0;
use utf8;
use Cwd;
use File::Slurp;
use open IO => ":utf8";

# Driver: collects the USFM files found in $inDir, opens a log file in
# $outDir and hands the conversion work to ProcessFiles().

$| = 1;       # autoflush the selected output handle so progress shows at once
$" = "\n";    # arrays interpolated into strings are joined with newlines

my ($inDir, $outDir) = ("/Users/Henry/Documents/git.Door43/en_ulb", "out");

# $fileSpec is interpolated into a regex below, so it is single-quoted to
# keep its backslash; in double quotes "\.usfm" collapses to ".usfm", whose
# dot would match any character (e.g. a file named "notesusfm").
my ($pwd, $os, $fileSpec) = (cwd(), $^O, '\.usfm');
my (@array, @usfmLines);
my (%hash);
my ($d, $whatami, $inFile);
my ($book, $chap, $vers, $text, $outText, $newV, $newC, $newB, $outFile, $usfmText, $metathesis);

###

# $d is the path separator used whenever a file name is assembled.
if ($os eq "darwin" || $os eq "linux") { $d = "/" }
else                                   { $d = "\\" }

# The log file is written inside $outDir, so create the directory when it is
# missing instead of failing later in open(). Existing contents are left
# untouched.
unless (-d $outDir) {
    mkdir($outDir, 0755) or die "Can't create $outDir: $!\n";
}

# Read the input directory directly rather than chdir-ing into it and back:
# an unchecked chdir that fails would silently list the wrong directory.
opendir(my $dirHandle, $inDir) or die "serious dainbramage: $inDir: $!";
my @infiles = grep { /$fileSpec$/i } readdir $dirHandle;
closedir $dirHandle;

# LOG stays a bareword handle because the (currently commented-out) debug
# statements in the subs below refer to it by that name.
my $logPath = "$outDir${d}log.log";
open(LOG, ">:utf8", $logPath) or die "$logPath:\n$!";
say "$logPath open";

ProcessFiles();

say "Done.";
close LOG or die "$logPath:\n$!";

# ProcessFiles([$outPath])
#
# Converts every file named in the file-level @infiles (looked up in $inDir)
# from USFM markup to plain text and writes the results, in sorted file-name
# order, to one UTF-8 output file. Each line of a file is passed through
# SearchAndReplace(), which turns verse markers into "Book chap:verse<TAB>"
# prefixes.
#
#   $outPath  optional output file name; defaults to the original hard-coded
#             "ULB text.txt" location.
#
# Dies when the output file cannot be opened or closed, or when reading an
# input file yields no defined content.
sub ProcessFiles {
    my ($outPath) = @_;
    $outPath //= "/Users/Henry/Google Drive/WA/Test/Unlocked Bible/ULB text.txt";

    open(my $out, ">:utf8", $outPath) or die "$outPath:\n$!";

    @infiles = sort @infiles;

    foreach my $file (@infiles) {
        say $inDir . "/" . $file;

        my $path = "$inDir${d}$file";
        my $usfm = read_file($path, binmode => 'utf8');
        # An empty file is false but perfectly readable, so test definedness
        # instead of truth and say which file failed.
        defined $usfm or die "$path: no content could be read\n";

        # --- whole-file clean-up before the text is split into lines ---

        # Join continuation lines (not starting with a backslash or a space)
        # onto the previous line.
        $usfm =~ s/\r?\n([^\\ \r\n])/ $1/g;
        # Drop footnotes: \f + ... \f*
        $usfm =~ s/\\f \+.*?\\f\*//g;
        # Treat indented paragraphs like ordinary paragraphs.
        $usfm =~ s/\\pi/\\p/g;
        # Drop \id, \ide, \toc?, \mt and \sp lines.
        # NOTE(review): the last alternative matches a literal backslash
        # followed by "d", not digits, so "\c N" lines are NOT removed here;
        # SearchAndReplace() needs to see them to track the chapter.
        $usfm =~ s/\\((ide?)|(toc.)|(mt|sp)|(c \\d+))[^\r\n]*\r?\n//g;
        # A \m, \p, \pi or \q[N] line that carries text and ends in CRLF is
        # replaced by its bare text (both surrounding line breaks go away).
        # NOTE(review): this only fires on CRLF input; LF-only files fall
        # through to the per-line handling in SearchAndReplace().
        $usfm =~ s/\r?\n\\(m|pi?|(q\d?)) ([^\r\n]*)\r\n/$3/g;
        # Strip trailing spaces.
        $usfm =~ s/ +\n/\n/g;
        # Mark verses that have no text at all.
        $usfm =~ s/(\n\\v \d+)\n/$1 \[blank\]\n/g;
        # No space in front of an em dash.
        $usfm =~ s/ —/—/g;

        # --- per-line conversion ---

        # SearchAndReplace() puts a newline in front of every verse it
        # recognises, so simply concatenating the pieces gives one output
        # line per verse.
        my $bookText = "";
        foreach my $line (split /\r?\n/, $usfm) {
            $line =~ s/^(\\q)[\t ]$/$1/;
            $bookText .= SearchAndReplace($line);
        }

        # --- whole-book clean-up of leftover markers and spacing ---

        $bookText =~ s/\r?\n>>\t/ /g;
        $bookText =~ s/>\t//g;
        $bookText =~ s/(\r?\n){2,}/\n/g;
        $bookText =~ s/— /—/g;
        $bookText =~ s/\\q\d//g;
        $bookText =~ s/\\p//g;
        $bookText =~ s/\\q //g;
        # Unwrap \qs ... \qs* spans. The match is non-greedy so that two
        # spans on one line are unwrapped separately instead of leaving the
        # inner markers in the text.
        $bookText =~ s/\\qs( .*?)\\qs\* ?/$1/g;
        $bookText =~ s/\n{2,}/\n/g;
        $bookText =~ s/ {2,}/ /g;

        say {$out} $bookText;
    }

    # Buffered write errors only surface at close, so check it.
    close($out) or die "$outPath:\n$!";
}

# SearchAndReplace($line)
#
# Converts one USFM line to its plain-text form and returns it.
#
# Besides returning the converted line, the sub keeps running state in the
# file-level variables $book, $chap, $vers and $metathesis (and sets the
# flags $newB, $newC, $newV):
#
#   \s5 ...       becomes a "-------" separator line
#   \h NAME       remembered as $book; the line is consumed
#   \c N          remembered as $chap; the marker is removed
#   \v N[-M] TXT  becomes "\n$book $chap:N[-M]\tTXT", prefixed with any
#                 pending \d text
#   \q[N] TXT     becomes " TXT"
#   \m TXT        becomes " TXT"
#   \d TXT        held back in $metathesis for the next verse; line consumed
#   bare \q \p \m become the empty string
#
# Only the first matching rule after the \s5 rewrite is applied. Lines that
# match none of them are returned unchanged.
sub SearchAndReplace {
    my $thisxLine = shift;

    $thisxLine =~ s/\\s5.*$/\n-------\n/;

    # The book name is matched non-greedily so that trailing spaces are left
    # to " *" instead of ending up in $book and in every verse reference.
    if ($thisxLine =~ s/\\h (.+?) *$//) {
        $book = $1;
        $newB = 1;
    }
    elsif ($thisxLine =~ s/\\c (\d+)//) {
        $chap = $1;
        $newC = 1;
    }
    elsif ($thisxLine =~ s/\\v (\d+(-\d+)?) (.*)$/$3/) {
        $vers = $1;
        $newV = 1;
        # Start a new output line carrying the verse reference.
        $thisxLine = "\n$book $chap:$vers\t$metathesis$thisxLine";
        $metathesis = "";
    }
    elsif ($thisxLine =~ s/^\\q\d? (.*)$/ $1/) {
        # Poetry line: the substitution already did all the work.
    }
    elsif ($thisxLine =~ s/^\\m (.*)$/ $1/) {
        # \m line with text: the substitution already did all the work.
    }
    elsif ($thisxLine =~ s/^\\d (.*)//) {
        $metathesis = "$1 ";
    }
    elsif ($thisxLine =~ s/^\\[qpm]$//) {
        # Bare marker with no text: dropped.
    }

    return $thisxLine;
}