#!/usr/bin/perl | |||||
use strict; | |||||
use JSON; | |||||
use DBI; | |||||
use GD::Simple; | |||||
use Data::Dumper; | |||||
use Data::UUID; | |||||
use warnings; | |||||
# Queue worker: scans the queue directory for files named
# "<uuid>_<uuid>.<ext>" (the first UUID is the document id). For every PDF it
# walks the pages one at a time: extracts the page text with pdftotext, runs
# detect_lang() on it, registers a new page via get_new_page() and renders the
# page to a 600 dpi JPEG named "<pageid>.jpeg" in the "original" directory.
$Data::Dumper::Sortkeys = 1;

my $conf = load_conf("../etc/autodoc.json");
my $dbh  = sqlconnect($conf->{sql});

my $queuedir    = $conf->{path}{global} . '/' . $conf->{path}{queue};
my $originaldir = $conf->{path}{global} . '/' . $conf->{path}{original};

# Lower-case hyphenated UUID (8-4-4-4-12 hex digits).
my $uuid_re = qr/[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}/;

opendir(my $queue, $queuedir)
    or die "Failed to open queue directory $queuedir: $!";
my @entries = readdir($queue);
closedir($queue);

FILE:
foreach my $file (@entries) {
    # \A / \z (rather than ^ / $) so a name with a trailing newline can never
    # match and end up inside a shell command.
    next FILE unless $file =~ /\A($uuid_re)_$uuid_re\.([a-z]+)\z/;
    my ($docid, $ext) = ($1, $2);
    print "Found document id $docid of type $ext\n";

    # Only PDFs are processed so far; every other extension is skipped.
    next FILE unless $ext eq 'pdf';

    my $pdf = "$queuedir/$file";

    # Single-quote the path for the shell (pdftotext is run through the shell
    # so that its stderr can be discarded).
    (my $quoted_pdf = $pdf) =~ s/'/'\\''/g;

    # The page count is not known in advance: keep asking for the next page
    # until pdftotext reports a failure.
    PAGE:
    for (my $page = 0; ; $page++) {
        my $pageno = $page + 1;    # the poppler tools count pages from 1
        print "texting page $page\n";

        my $cmd = sprintf("pdftotext -f %d -l %d '%s' - 2>/dev/null",
            $pageno, $pageno, $quoted_pdf);
        open(my $txt_fh, '-|', $cmd) or last PAGE;

        # Flatten the page text to one line: every input line is appended
        # with a leading space.
        my $txt = '';
        while (my $line = <$txt_fh>) {
            chomp $line;
            $txt .= ' ' . $line;
        }
        close($txt_fh);

        # A non-zero exit status is taken as "end of PDF".
        last PAGE if $?;

        my %lang   = detect_lang($txt);    # result is not used yet
        my $pageid = get_new_page($docid);
        print "new page id $pageid\n";

        # List-form system(): no shell involved, so paths need no quoting.
        my $jpeg = "$originaldir/$pageid.jpeg";
        system('pdftoppm', '-f', $pageno, '-l', $pageno, '-r', 600,
            '-jpeg', '-singlefile', $pdf, $jpeg) == 0
            or warn "pdftoppm failed for page $pageno of $file (status $?)\n";

        # pdftoppm appends its own ".jpg" to the output prefix; when that
        # happened, move the result to the intended "<pageid>.jpeg" name.
        if (-e "$jpeg.jpg") {
            rename("$jpeg.jpg", $jpeg)
                or warn "Failed to rename $jpeg.jpg to $jpeg: $!\n";
        }
        print "create original page jpeg $pageid.jpeg\n";
    }
}
# Processing outline:
#   - open the queue
#   - decompose each PDF into pages
#   - normalise all files as JPEGs
#   - generate a page record
#   - OCR / language detection
#   - update the database
# Count, per language, how many words of $txt are found in the `dict` table.
#
# $txt is split on single spaces; each non-empty word is lower-cased and
# looked up with a LIKE match. When a word matches rows in more than one
# language, only the last row fetched is counted. The counts are also dumped
# to STDOUT.
#
# Returns the counts as a hash (language => number of matching words), so the
# result can be assigned directly to a hash by the caller.
sub detect_lang {
    my ($txt) = @_;
    $txt = '' unless defined $txt;

    my %lcnt;
    foreach my $word (split(/ /, $txt)) {
        # Leading or consecutive spaces yield empty fields; do not waste a
        # database round trip on them.
        next if $word eq '';

        my $sth = sqlquery($dbh, "SELECT lang FROM dict WHERE word like ?", lc($word));
        my $lang;
        while (my ($row_lang) = $sth->fetchrow_array()) {
            $lang = $row_lang;
        }
        $lcnt{$lang}++ if defined $lang;
    }

    print Dumper(\%lcnt);
    return %lcnt;
}
# Register a fresh page for the document $docid.
#
# Generates a new page id, inserts a `pages` row for it with status
# 'inprogress', then links it to the document through `documents_pages`.
# Returns the new page id.
sub get_new_page {
    my ($docid) = @_;
    my $pageid = gen_uuid();

    # The statements are assembled line by line; the resulting SQL text is
    # one clause per line with a leading newline.
    my $insert_page = join("\n",
        '',
        'INSERT INTO',
        'pages',
        'SET',
        'id = ?,',
        'created = NOW(),',
        "status = 'inprogress'",
    );
    my $insert_link = join("\n",
        '',
        'INSERT INTO',
        'documents_pages',
        'SET',
        'documentId = ?,',
        'pageId = ?',
    );

    sqlquery($dbh, $insert_page, $pageid);
    sqlquery($dbh, $insert_link, $docid, $pageid);

    return $pageid;
}
# Return a newly generated UUID string, lower-cased.
sub gen_uuid {
    my $uuid = Data::UUID->new->create_str();
    return lc $uuid;
}
# Load the JSON configuration file at $file.
#
# Returns whatever from_json() produces for the file's content.
# Dies, naming the file and the OS error, if the file cannot be opened.
sub load_conf {
    my ($file) = @_;
    open(my $fh, '<', $file)
        or die "Failed to load configuration file $file: $!";
    my $json = do { local $/; <$fh> };    # slurp the whole file
    close($fh);
    return from_json($json);
}
# Open a MySQL connection described by the hashref $sql
# (keys read: base, host, user, pass).
#
# Returns the DBI database handle. Dies with the DBI error text when the
# connection cannot be established.
sub sqlconnect {
    my ($sql) = @_;
    my $dsn = "DBI:mysql:database=$sql->{base};host=$sql->{host}";
    my $dbh = DBI->connect($dsn, $sql->{user}, $sql->{pass})
        or die "Failed to connect to database: "
            . (defined $DBI::errstr ? $DBI::errstr : 'unknown error');
    return $dbh;
}
# Prepare $query on $dbh and execute it, binding the remaining arguments to
# its placeholders.
#
# Returns the executed statement handle. Dies, naming the step that failed
# (prepare or execute) together with the handle's error text.
sub sqlquery {
    my ($dbh, $query, @args) = @_;
    #print STDERR "$query\n";
    my $sth = $dbh->prepare($query)
        or die "Failed to prepare SQL query: " . ($dbh->errstr || 'unknown error');
    $sth->execute(@args)
        or die "Failed to execute SQL query: " . ($sth->errstr || 'unknown error');
    return $sth;
}
#!/usr/bin/perl | |||||
use strict; | |||||
use JSON; | |||||
use DBI; | |||||
use warnings; | |||||
# Maintenance script: deletes every row from the tables listed below,
# announcing each statement on STDOUT before running it.
my $conf = load_conf("../etc/autodoc.json");
my $dbh  = sqlconnect($conf->{sql});

# Tables to clear, processed in this order.
my @tables = qw(pages documents tags words);

for my $table (@tables) {
    my $statement = "DELETE FROM $table";
    print "$statement\n";
    sqlquery($dbh, $statement);
}
# Load the JSON configuration file at $file.
#
# Returns whatever from_json() produces for the file's content.
# Dies, naming the file and the OS error, if the file cannot be opened.
# (This script defines no fatal_api_error(), so a plain die is used.)
sub load_conf {
    my ($file) = @_;
    open(my $fh, '<', $file)
        or die "Failed to load configuration file $file: $!";
    my $json = do { local $/; <$fh> };    # slurp the whole file
    close($fh);
    return from_json($json);
}
# Open a MySQL connection described by the hashref $sql
# (keys read: base, host, user, pass).
#
# Returns the DBI database handle. Dies with the DBI error text when the
# connection cannot be established, so callers never receive a value that is
# not a database handle.
sub sqlconnect {
    my ($sql) = @_;
    my $dsn = "DBI:mysql:database=$sql->{base};host=$sql->{host}";
    my $dbh = DBI->connect($dsn, $sql->{user}, $sql->{pass})
        or die "Failed to connect to database: "
            . (defined $DBI::errstr ? $DBI::errstr : 'unknown error') . "\n";
    return $dbh;
}
# Prepare $query on $dbh and execute it, binding the remaining arguments to
# its placeholders.
#
# Returns the executed statement handle. Dies, naming the step that failed
# (prepare or execute) together with the handle's errstr; DBI handles expose
# errstr, not error.
sub sqlquery {
    my ($dbh, $query, @args) = @_;
    my $sth = $dbh->prepare($query)
        or die "Failed to prepare SQL query: " . ($dbh->errstr || 'unknown error');
    $sth->execute(@args)
        or die "Failed to execute SQL query: " . ($sth->errstr || 'unknown error');
    return $sth;
}
#!/usr/bin/perl | |||||
use strict; | |||||
use JSON; | |||||
use DBI; | |||||
use utf8; | |||||
use GD::Simple; | |||||
use Data::Dumper; | |||||
use Data::UUID; | |||||
use warnings; | |||||
# Dictionary loader: inserts every line of the listed /usr/share/dict word
# lists into the `dict` table, tagged with its language code, and prints a
# progress line with an ETA while each list is being loaded.
$| = 1;    # unbuffered STDOUT so the "\r"-terminated progress line refreshes
$Data::Dumper::Sortkeys = 1;

my $conf = load_conf("../etc/autodoc.json");
my $dbh  = sqlconnect($conf->{sql});

# Language code => word-list file names under /usr/share/dict.
my %dicts_for = (
    'fr' => [ 'french' ],
    'de' => [ 'swiss', 'ngerman' ],
    'en' => [ 'british-english-large', 'american-english-large' ],
    'it' => [ 'italian' ],
);

foreach my $lang (sort keys %dicts_for) {
    print "Loading language $lang ...\n";

    DICT:
    foreach my $dict (@{ $dicts_for{$lang} }) {
        my $file = '/usr/share/dict/' . $dict;

        # A word list that is missing or unreadable is skipped.
        my $fh;
        unless (open($fh, '<', $file)) {
            warn "Skipping $file: $!\n";
            next DICT;
        }

        # First pass: count the lines so progress and ETA can be reported,
        # then rewind for the actual load.
        my $len = 0;
        while (my $line = <$fh>) {
            $len++;
        }
        seek($fh, 0, 0) or die "Failed to rewind $file: $!";

        my $cnt   = 0;
        my $start = time();
        while (my $word = <$fh>) {
            chomp $word;
            sqlquery($dbh, "INSERT IGNORE INTO dict SET word = ?, lang = ?",
                $word, $lang);
            $cnt++;

            # Refresh the progress line every 777 words and on the last one.
            next unless $cnt % 777 == 0 || $cnt == $len;

            my $elapsed = time() - $start;
            printf("\t%s %s/%s (%i%%) ETA: %ss \r",
                $dict, $cnt, $len, int($cnt / $len * 100),
                # Remaining time = projected total time minus elapsed time.
                $elapsed == 0 ? '-' : int($elapsed / $cnt * $len - $elapsed)
            );
        }
        close($fh);
        printf("\n");
    }
}
# Load the JSON configuration file at $file.
#
# Returns whatever from_json() produces for the file's content.
# Dies, naming the file and the OS error, if the file cannot be opened.
sub load_conf {
    my ($file) = @_;
    open(my $fh, '<', $file)
        or die "Failed to load configuration file $file: $!";
    my $json = do { local $/; <$fh> };    # slurp the whole file
    close($fh);
    return from_json($json);
}
# Open a MySQL connection described by the hashref $sql
# (keys read: base, host, user, pass), with the driver's mysql_enable_utf8
# option switched on.
#
# Returns the DBI database handle. Dies with the DBI error text when the
# connection cannot be established.
sub sqlconnect {
    my ($sql) = @_;
    my $dsn = "DBI:mysql:database=$sql->{base};host=$sql->{host}";
    my %attr = (mysql_enable_utf8 => 1);
    my $dbh = DBI->connect($dsn, $sql->{user}, $sql->{pass}, \%attr)
        or die "Failed to connect to database: "
            . (defined $DBI::errstr ? $DBI::errstr : 'unknown error');
    return $dbh;
}
# Prepare $query on $dbh and execute it, binding the remaining arguments to
# its placeholders.
#
# Returns the executed statement handle. Dies, naming the step that failed
# (prepare or execute) together with the handle's error text.
sub sqlquery {
    my ($dbh, $query, @args) = @_;
    #print STDERR "$query\n";
    my $sth = $dbh->prepare($query)
        or die "Failed to prepare SQL query: " . ($dbh->errstr || 'unknown error');
    $sth->execute(@args)
        or die "Failed to execute SQL query: " . ($sth->errstr || 'unknown error');
    return $sth;
}