|
|
|
@@ -0,0 +1,147 @@ |
|
|
|
#!/usr/bin/perl |
|
|
|
|
|
|
|
use strict; |
|
|
|
use JSON; |
|
|
|
use DBI; |
|
|
|
use GD::Simple; |
|
|
|
use Data::Dumper; |
|
|
|
use Data::UUID; |
|
|
|
use warnings; |
|
|
|
|
|
|
|
$Data::Dumper::Sortkeys = 1;

# Configuration and the database handle shared by the subs in this file.
my $conf = load_conf("../etc/autodoc.json");
my $dbh  = sqlconnect($conf->{sql});

# Queue of incoming files, and the directory that receives rendered pages.
my $queuedir    = $conf->{path}{global}.'/'.$conf->{path}{queue};
my $originaldir = $conf->{path}{global}.'/'.$conf->{path}{original};

# Queue entries are named <uuid>_<uuid>.<ext>. The first UUID is used as the
# document id; the second one is not used by this script.
my $uuid_re  = qr/[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}/;
my $queue_re = qr/\A($uuid_re)_$uuid_re\.([a-z]+)\z/;

opendir(my $queue, $queuedir)
    or die "Failed to open queue directory '$queuedir': $!";

foreach my $file ( readdir($queue) ) {
    # Anything that does not look like a queue entry (., .., strays) is skipped.
    my ($docid, $ext) = $file =~ $queue_re
        or next;

    print "Found document id $docid of type $ext\n";

    # Only PDFs are handled so far; other recognised types are ignored.
    next unless $ext eq 'pdf';

    my $pdf = "$queuedir/$file";

    # Walk the pages until pdftotext reports that there are no more.
    for (my $page = 0; ; $page++) {
        my $pageno = $page + 1;     # the command-line tools count pages from 1

        print "texting page $page\n";

        my $txt = _pdf_page_text($pdf, $pageno);
        last unless defined $txt;   # end of PDF

        # The detection result is not stored anywhere yet.
        my %lang = detect_lang($txt);

        my $pageid = get_new_page($docid);
        print "new page id $pageid\n";

        # List-form system() bypasses the shell, so paths containing spaces or
        # shell metacharacters are passed through unchanged.
        # NOTE(review): pdftoppm appends its own extension to the output root,
        # so the file on disk is presumably "$pageid.jpeg.jpg" (an earlier,
        # disabled "mv" step pointed at this) - confirm and rename if needed.
        my $status = system(
            'pdftoppm',
            '-f', $pageno, '-l', $pageno,
            '-r', 600, '-jpeg', '-singlefile',
            $pdf, "$originaldir/$pageid.jpeg",
        );
        if ($status == 0) {
            print "create original page jpeg $pageid.jpeg\n";
        }
        else {
            warn "pdftoppm failed for page $pageno of $file (status $?)\n";
        }
    }
}

closedir($queue);

# Extract the text of one PDF page as a single string, each line prefixed
# with a space.
#
# Parameters:
#   $pdf    - path of the PDF file
#   $pageno - 1-based page number
#
# Returns the text (possibly empty), or undef when pdftotext could not be
# started or exited with a non-zero status. The caller treats undef as
# "no more pages".
sub _pdf_page_text {
    my ($pdf, $pageno) = @_;

    # Running past the last page is how the page loop terminates, so the
    # child's stderr is pointed at /dev/null while it is being started.
    open(my $saved_stderr, '>&', \*STDERR) or return;
    my ($pid, $pipe);
    if (open(STDERR, '>', '/dev/null')) {
        $pid = open($pipe, '-|', 'pdftotext', '-f', $pageno, '-l', $pageno, $pdf, '-');
    }
    open(STDERR, '>&', $saved_stderr);
    return unless $pid;

    my $txt = '';
    while (my $line = <$pipe>) {
        chomp $line;
        $txt .= ' ' . $line;
    }

    # close() on a pipe waits for the child and stores its exit status in $?.
    close($pipe);
    return if $?;

    return $txt;
}
|
|
|
|
|
|
|
# open queue |
|
|
|
# decompose PDF |
|
|
|
# normalise all files as jpegs |
|
|
|
# generate page |
|
|
|
# ocr / lang detect |
|
|
|
# update db |
|
|
|
|
|
|
|
# Count, per language, how many words of a text are found in the `dict` table.
#
# Parameters:
#   $txt - text to analyse; words are separated by whitespace and are
#          lower-cased before the lookup.
#
# Returns:
#   A hash (as a flat list) mapping each language found to the number of
#   words of $txt attributed to it. Empty when no word is known.
#
# Side effects:
#   Prints a dump of the counts to STDOUT. Dies through sqlquery() when a
#   database call fails.
sub detect_lang {
    my($txt) = @_;

    # split ' ' splits on runs of any whitespace and drops leading empty
    # fields, so empty strings and stray form feeds are never looked up.
    # Occurrences are counted first so each distinct word is queried once.
    my %occurrences;
    $occurrences{ lc $_ }++ for split(' ', defined $txt ? $txt : '');

    my %lcnt;
    foreach my $word ( sort keys %occurrences ) {
        my $q = sqlquery($dbh, "SELECT lang FROM dict WHERE word like ?", $word);

        # When several rows match, the last row fetched decides the language.
        my $lang;
        while ( my ($row_lang) = $q->fetchrow_array() ) {
            $lang = $row_lang;
        }

        $lcnt{$lang} += $occurrences{$word} if defined $lang;
    }

    print Dumper(\%lcnt);

    return %lcnt;
}
|
|
|
|
|
|
|
# Register a new page for a document.
#
# Parameters:
#   $docid - id of the document the page belongs to
#
# A page id is obtained from gen_uuid(); a row with status 'inprogress' is
# inserted into `pages`, then a row linking the document and the page is
# inserted into `documents_pages`.
#
# Returns the new page id.
sub get_new_page {
    my $docid   = shift;
    my $new_id  = gen_uuid();

    # Each entry is a statement followed by its bind values, run in order.
    my @inserts = (
        [ "
        INSERT INTO
            pages
        SET
            id = ?,
            created = NOW(),
            status = 'inprogress'",
          $new_id ],
        [ "
        INSERT INTO
            documents_pages
        SET
            documentId = ?,
            pageId = ?",
          $docid, $new_id ],
    );

    foreach my $insert (@inserts) {
        sqlquery($dbh, @{$insert});
    }

    return $new_id;
}
|
|
|
|
|
|
|
# Return a newly generated UUID string, lower-cased.
sub gen_uuid {
    my $uuid = Data::UUID->new->create_str();

    return lc $uuid;
}
|
|
|
|
|
|
|
# Read a JSON configuration file and return the decoded data.
#
# Parameters:
#   $file - path of the configuration file
#
# Returns whatever from_json() produces for the file content.
# Dies when the file cannot be opened (the message names the file and the
# system error); decoding errors are raised by from_json().
sub load_conf {
    my($file) = @_;

    # Three-argument open treats $file strictly as a path: leading characters
    # such as '>' or '|' are not interpreted as an open mode.
    open(my $fh, '<', $file)
        or die "Failed to load configuration file '$file': $!";

    # Undefining the record separator makes the read return the whole file.
    my $json = do { local $/; <$fh> };
    close($fh);

    return from_json($json);
}
|
|
|
|
|
|
|
# Open a MySQL connection through DBI.
#
# Parameters:
#   $sql - hash reference with the keys `base` (database name), `host`,
#          `user` and `pass`
#
# Returns the handle returned by DBI->connect.
# Dies, including the DBI error text when there is one, if the connection
# cannot be established.
sub sqlconnect {
    my($sql) = @_;

    my $dsn = "DBI:mysql:database=$sql->{base};host=$sql->{host}";

    my $dbh = DBI->connect($dsn, $sql->{user}, $sql->{pass});
    unless ($dbh) {
        my $why = defined $DBI::errstr ? $DBI::errstr : 'unknown error';
        die "Failed to connect to database: $why";
    }

    return $dbh;
}
|
|
|
|
|
|
|
# Prepare and execute one SQL statement.
#
# Parameters:
#   $dbh   - database handle providing prepare() and errstr()
#   $query - SQL text, with `?` placeholders for the bind values
#   @args  - bind values, passed to execute() in order
#
# Returns the executed statement handle so the caller can fetch rows.
# Dies with a message that says which step failed (prepare or execute) and
# includes the error text reported by the handle.
sub sqlquery {
    my $dbh = shift;
    my $query = shift;
    my @args = @_;

    #print STDERR "$query\n";

    my $sth = $dbh->prepare($query);
    unless ($sth) {
        my $why = $dbh->errstr;
        $why = 'unknown error' unless defined $why;
        die "Failed to prepare SQL query: $why";
    }

    unless ($sth->execute(@args)) {
        my $why = $sth->errstr;
        $why = 'unknown error' unless defined $why;
        die "Failed to execute SQL query: $why";
    }

    return $sth;
}