|
|
|
|
|
|
|
|
use GD::Simple; |
|
|
use GD::Simple; |
|
|
use Data::Dumper; |
|
|
use Data::Dumper; |
|
|
use Data::UUID; |
|
|
use Data::UUID; |
|
|
|
|
|
use File::Temp; |
|
|
use warnings; |
|
|
use warnings; |
|
|
|
|
|
|
|
|
# Emit hash keys in sorted order so Data::Dumper output is stable between runs.
$Data::Dumper::Sortkeys = 1;

# Directory scanned for queued files and directory receiving the page images,
# both located below the configured global path.
my $queuedir    = $conf->{path}{global}.'/'.$conf->{path}{queue};
my $originaldir = $conf->{path}{global}.'/'.$conf->{path}{original};

# Lookup table: language short code => id, loaded once from the lang table.
my %langid;
my $q = sqlquery($dbh, "SELECT id,short FROM lang");
while ( my ($id, $short) = $q->fetchrow_array() ) {
    $langid{$short} = $id;
}
|
|
|
|
|
|
|
|
opendir(Q,$queuedir); |
|
|
opendir(Q,$queuedir); |
|
|
foreach my $file ( readdir(Q) ) { |
|
|
foreach my $file ( readdir(Q) ) { |
|
|
if ( $file =~ /^([0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12})_([0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12})\.([a-z]+)$/ ) { |
|
|
if ( $file =~ /^([0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12})_([0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12})\.([a-z]+)$/ ) { |
|
|
|
|
|
|
|
|
# end of PDF |
|
|
# end of PDF |
|
|
last if $?; |
|
|
last if $?; |
|
|
|
|
|
|
|
|
my %lang = detect_lang($txt); |
|
|
|
|
|
|
|
|
my ($lang,$words) = detect_lang($txt); |
|
|
|
|
|
print "language is $lang\n"; |
|
|
|
|
|
#spell_check($txt,$lang); |
|
|
my $pageid = get_new_page($docid); |
|
|
my $pageid = get_new_page($docid); |
|
|
print "new page id $pageid\n"; |
|
|
print "new page id $pageid\n"; |
|
|
|
|
|
update_page_status($pageid, 'inprogress'); |
|
|
|
|
|
|
|
|
system(sprintf("pdftoppm -f %s -l %s -r 600 -jpeg -singlefile %s/%s %s/%s.jpeg", $page+1, $page+1, $queuedir, $file, $originaldir, $pageid)); |
|
|
|
|
|
#system(sprintf("mv %s/%s.jpeg.jpg %s/%s.jpeg", |
|
|
|
|
|
print "create original page jpeg $pageid.jpeg"; |
|
|
print "create original page jpeg $pageid.jpeg"; |
|
|
|
|
|
system(sprintf("pdftoppm -f %s -l %s -r 300 -jpeg -singlefile %s/%s %s/%s", $page+1, $page+1, $queuedir, $file, $originaldir, $pageid)); |
|
|
|
|
|
system(sprintf("mv %s/%s.jpg %s/%s.jpeg", |
|
|
|
|
|
$originaldir, $pageid, |
|
|
|
|
|
$originaldir, $pageid )); |
|
|
|
|
|
|
|
|
|
|
|
create_page_words($pageid, $lang, $words); |
|
|
|
|
|
|
|
|
|
|
|
update_page_status($pageid, 'ok'); |
|
|
|
|
|
|
|
|
} |
|
|
} |
|
|
} |
|
|
} |
|
|
else { |
|
|
else { |
|
|
|
|
|
|
|
|
# ocr / lang detect |
|
|
# ocr / lang detect |
|
|
# update db |
|
|
# update db |
|
|
|
|
|
|
|
|
|
|
|
# Register every word of a page by calling the add_page_word procedure once
# per word.
#
# Parameters:
#   $pageid - id of the page the words belong to
#   $lang   - language short code; translated to an id through %langid
#   $words  - array reference holding the words
sub create_page_words {
    my ($pageid, $lang, $words) = @_;

    for my $entry ( @{$words} ) {
        sqlquery(
            $dbh,
            "CALL add_page_word(?,?,?)",
            $pageid, $entry, $langid{$lang},
        );
    }
}
|
|
|
|
|
|
|
|
|
|
|
# Feed $txt to aspell in pipe mode and echo aspell's output to STDOUT.
#
# Parameters:
#   $txt  - text to check
#   $lang - language code handed to aspell as --lang
#
# Returns the result of closing the aspell pipe (false when aspell could not
# be run or exited with a non-zero status). Problems are reported with warn()
# rather than die(), so a failed spell check never aborts the caller.
sub spell_check {
    my ($txt, $lang) = @_;

    if ( !defined $txt || !defined $lang ) {
        warn "spell_check: text and language are required\n";
        return;
    }

    require POSIX;    # only needed for _exit() in the forked child

    my $tmp = File::Temp->new();
    print {$tmp} $txt;

    # The text sits in the handle's buffer until flushed; without this the
    # reader may see an empty or truncated file.
    if ( !$tmp->flush() ) {
        warn "spell_check: cannot write temp file: $!\n";
        return;
    }

    # Forking pipe open: the child's STDOUT becomes the read end we get back.
    # No shell is involved, so $lang cannot be interpreted as shell syntax.
    my $pid = open( my $aspell, '-|' );
    if ( !defined $pid ) {
        warn "spell_check: cannot fork: $!\n";
        return;
    }

    if ( $pid == 0 ) {
        # Child: read the text from the temp file on STDIN, then become aspell.
        # _exit() skips END blocks and destructors inherited from the parent.
        if ( !open( STDIN, '<', $tmp->filename ) ) {
            warn "spell_check: cannot read temp file: $!\n";
            POSIX::_exit(127);
        }
        exec { 'aspell' } 'aspell', "--lang=$lang", '--ignore-case', 'pipe';
        warn "spell_check: cannot exec aspell: $!\n";
        POSIX::_exit(127);
    }

    while ( my $line = <$aspell> ) {
        print $line;
    }

    my $ok = close($aspell);
    warn "spell_check: aspell failed (status $?)\n" if !$ok;
    return $ok;
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Guess the language of a text by looking every word up in the dict table.
#
# The text is split into words, words shorter than three characters are
# dropped and the rest are lower-cased. Each remaining word is looked up in
# dict, and every language returned for it scores one hit. Progress and the
# final hit counts are printed to STDOUT.
#
# Parameters:
#   $txt - text to analyse
#
# Returns a two-element list:
#   - the language with the most hits (undef when no word matched); ties go
#     to the alphabetically first language so the result is repeatable
#   - array reference with all words that were looked up, in text order
sub detect_lang {
    my ($txt) = @_;

    my %lcnt;     # language => number of dictionary hits
    my @words;    # every word that was looked up

    return ( undef, \@words ) if !defined $txt;

    # NOTE(review): \x{c380}-\x{c3bf} is the code point range U+C380..U+C3BF.
    # If the UTF-8 byte pairs C3 80..C3 BF (U+00C0..U+00FF) were meant,
    # accented Latin letters currently act as separators -- confirm how $txt
    # is encoded before changing the pattern.
    foreach my $word ( split( /[^a-zA-Z\x{c380}-\x{c3bf}]/, $txt ) ) {
        next if length($word) < 3;

        $word = lc($word);
        push @words, $word;

        print "$word ";

        my $q = sqlquery($dbh, "SELECT lang FROM dict WHERE word like ?", $word);
        while ( my ($l) = $q->fetchrow_array() ) {
            print "$l ";
            $lcnt{$l}++;
        }
        print "\n";
    }

    print Dumper(\%lcnt);

    # Every stored count is at least 1, so starting from 0 with a strict '>'
    # selects the first language (in sorted order) that has the highest count.
    my $max = 0;
    my $lmax;
    foreach my $lang ( sort keys %lcnt ) {
        if ( $lcnt{$lang} > $max ) {
            $lmax = $lang;
            $max  = $lcnt{$lang};
        }
    }

    return ( $lmax, \@words );
}
|
|
|
|
|
|
|
|
|
|
|
# Set the status of a page through the update_page_status procedure.
#
# Parameters:
#   $pageid - id of the page to update
#   $status - new status value (the callers in this file pass 'inprogress'
#             and 'ok')
#
# Returns whatever sqlquery() returns for the call.
sub update_page_status {
    my ($pageid, $status) = @_;

    return sqlquery($dbh, "CALL update_page_status(?,?)", $pageid, $status);
}
|
|
|
|
|
|
|
|
# Create a new page for a document through the create_page procedure.
#
# Parameters:
#   $docid - id of the document the page belongs to
#
# Returns the page id delivered by the procedure, or undef when the call
# produced no row.
#
# NOTE(review): this text also contained a variant that generated the id with
# gen_uuid() and inserted into pages and documents_pages directly. Running
# both would create the page twice, so only the procedure call is kept --
# confirm this is the intended revision.
sub get_new_page {
    my ($docid) = @_;

    my $pageid;

    my $q = sqlquery($dbh, "CALL create_page(?)", $docid);

    # Read the result set to the end; the id from the last row is returned.
    while ( my ($id) = $q->fetchrow_array() ) {
        $pageid = $id;
    }

    return $pageid;
}