@@ -0,0 +1,147 @@ | |||
#!/usr/bin/perl | |||
use strict; | |||
use JSON; | |||
use DBI; | |||
use GD::Simple; | |||
use Data::Dumper; | |||
use Data::UUID; | |||
use warnings; | |||
# Queue worker: scans the queue directory for uploaded documents. For every
# PDF it extracts the text of each page, registers a page row in the database
# and renders the page as a JPEG into the "original" directory.
$Data::Dumper::Sortkeys = 1;

my $conf = load_conf("../etc/autodoc.json");
my $dbh  = sqlconnect($conf->{sql});

my $queuedir    = $conf->{path}{global}.'/'.$conf->{path}{queue};
my $originaldir = $conf->{path}{global}.'/'.$conf->{path}{original};

# Queue entries are named "<uuid>_<uuid>.<ext>"; the first UUID is used as
# the document id and the extension selects the processing path.
my $queue_name_re = qr/^([0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12})_([0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12})\.([a-z]+)$/;

opendir(my $queue, $queuedir)
    or die "Failed to open queue directory $queuedir: $!";
my @entries = readdir($queue);
closedir($queue);

foreach my $file (@entries) {
    next unless $file =~ $queue_name_re;
    my $docid = $1;
    my $ext   = $3;
    print "Found document id $docid of type $ext\n";

    if ( $ext eq 'pdf' ) {
        my $pdf = "$queuedir/$file";

        # Walk the pages until pdftotext reports an error, which is taken as
        # the end of the document.
        for ( my $page = 0; ; $page++ ) {
            my $pageno = $page + 1;    # the poppler tools number pages from 1
            my $txt    = '';
            print "texting page $page\n";

            # Shell form is kept on purpose: the 2>/dev/null redirection hides
            # the error pdftotext prints once the page range is exhausted.
            open(my $txt_fh, '-|', sprintf("pdftotext -f %s -l %s %s - 2>/dev/null |" =~ s/ \|$//r, $pageno, $pageno, $pdf)) || last;
            while ( my $line = <$txt_fh> ) {
                chomp $line;
                $txt .= ' ' . $line;
            }
            close($txt_fh);
            # end of PDF
            last if $?;

            # detect_lang() prints the per-language tally itself; the result
            # is not used any further yet.
            detect_lang($txt);

            my $pageid = get_new_page($docid);
            print "new page id $pageid\n";

            # pdftoppm appends ".jpg" to the output root on its own, so render
            # to "<pageid>" and rename to the ".jpeg" name announced below.
            my $root   = "$originaldir/$pageid";
            my $status = system('pdftoppm', '-f', $pageno, '-l', $pageno,
                '-r', 600, '-jpeg', '-singlefile', $pdf, $root);
            if ( $status != 0 ) {
                warn "pdftoppm failed for page $pageno of $pdf (status $status)\n";
                next;
            }
            if ( !rename("$root.jpg", "$root.jpeg") ) {
                warn "Failed to rename $root.jpg to $root.jpeg: $!\n";
                next;
            }
            print "create original page jpeg $pageid.jpeg\n";
        }
    }
    else {
        # Other file types are not processed yet.
    }
}
# open queue | |||
# decompose PDF | |||
# normalise all files as jpegs | |||
# generate page | |||
# ocr / lang detect | |||
# update db | |||
# Tally, per language, how many words of a text are found in the "dict" table.
#
# Parameters:
#   $txt - text to analyse; it is lower-cased and split on whitespace.
# Returns:
#   A hash (as a flat list) mapping each value of dict.lang to the number of
#   words that matched it. The same tally is dumped to STDOUT.
# Uses the file-level $dbh handle through sqlquery().
sub detect_lang {
    my ($txt) = @_;
    my %lcnt;
    my %lang_of;    # per-call cache: word => language (undef when not found)

    # split ' ' skips leading and repeated whitespace, so no empty "words"
    # are sent to the database.
    foreach my $word ( split ' ', lc($txt) ) {
        if ( !exists $lang_of{$word} ) {
            my $lang;

            # Exact match: with LIKE, a '%' or '_' inside the word would act
            # as a wildcard and match unrelated dictionary entries.
            my $q = sqlquery($dbh, "SELECT lang FROM dict WHERE word = ?", $word);

            # When a word is listed under several languages, only the last
            # row returned is kept.
            while ( my ($row_lang) = $q->fetchrow_array() ) {
                $lang = $row_lang;
            }
            $lang_of{$word} = $lang;
        }

        my $lang = $lang_of{$word};
        $lcnt{$lang}++ if defined $lang;
    }

    print Dumper(\%lcnt);
    return %lcnt;
}
# Create a page record for a document and return the new page id.
#
# Parameters:
#   $docid - id of the document the page belongs to.
# Returns:
#   The id generated by gen_uuid() for the new page.
# Inserts one row into "pages" (status 'inprogress') and one row into
# "documents_pages" linking the document to the page, using the file-level
# $dbh handle through sqlquery().
sub get_new_page {
    my $docid    = shift;
    my $new_page = gen_uuid();

    my $insert_page = "
INSERT INTO
pages
SET
id = ?,
created = NOW(),
status = 'inprogress'";

    my $link_page = "
INSERT INTO
documents_pages
SET
documentId = ?,
pageId = ?";

    sqlquery($dbh, $insert_page, $new_page);
    sqlquery($dbh, $link_page, $docid, $new_page);

    return $new_page;
}
# Return a newly generated UUID string from Data::UUID, converted to lower case.
sub gen_uuid {
    return lc( Data::UUID->new->create_str() );
}
# Read a JSON configuration file and return the decoded data.
#
# Parameters:
#   $file - path of the configuration file.
# Returns:
#   Whatever from_json() produces for the file content.
# Dies when the file cannot be opened; the message names the file and the
# operating-system error.
sub load_conf {
    my ($file) = @_;

    # Three-argument open with a lexical handle: the path can never be
    # interpreted as an open mode, and the handle closes with its scope.
    open(my $fh, '<', $file)
        or die "Failed to load configuration file $file: $!";
    my $json = do { local $/; <$fh> };    # slurp the whole file
    close($fh);

    return from_json($json);
}
# Open a MySQL connection through DBI.
#
# Parameters:
#   $sql - hash reference with the keys base, host, user and pass.
# Returns:
#   The database handle returned by DBI->connect.
# Dies with the DBI error text when the connection cannot be established.
sub sqlconnect {
    my ($sql) = @_;
    my $dsn = "DBI:mysql:database=$sql->{base};host=$sql->{host}";
    my $dbh = DBI->connect($dsn, $sql->{user}, $sql->{pass})
        or die "Failed to connect to database: $DBI::errstr";
    return $dbh;
}
# Prepare and execute an SQL statement with bind values.
#
# Parameters:
#   $dbh   - database handle (first argument).
#   $query - SQL text, with '?' placeholders (second argument).
#   @args  - bind values for the placeholders (remaining arguments).
# Returns:
#   The executed statement handle.
# Dies when prepare or execute fails; the message says which step failed and
# carries the handle's error text.
sub sqlquery {
    my ($dbh, $query, @args) = @_;
    #print STDERR "$query\n";
    my $sth = $dbh->prepare($query)
        or die "Failed to prepare SQL query: " . $dbh->errstr;
    $sth->execute(@args)
        or die "Failed to execute SQL query: " . $sth->errstr;
    return $sth;
}
@@ -0,0 +1,53 @@ | |||
#!/usr/bin/perl | |||
use strict; | |||
use JSON; | |||
use DBI; | |||
use warnings; | |||
# Reset script: issues an unconditional DELETE against each table listed
# below, announcing every statement on STDOUT first.
my $conf = load_conf("../etc/autodoc.json");
my $dbh  = sqlconnect($conf->{sql});

# Tables to empty, processed in this order.
my @del = qw(pages documents tags words);

for my $table (@del) {
    my $statement = "DELETE FROM $table";
    print "$statement\n";
    sqlquery($dbh, $statement);
}
# Read a JSON configuration file and return the decoded data.
#
# Parameters:
#   $file - path of the configuration file.
# Returns:
#   Whatever from_json() produces for the file content.
# Dies when the file cannot be opened; the message names the file and the
# operating-system error.
sub load_conf {
    my ($file) = @_;

    # This script defines no fatal_api_error(), so an open failure is
    # reported with a plain die.
    open(my $fh, '<', $file)
        or die "Failed to load configuration file $file: $!";
    my $json = do { local $/; <$fh> };    # slurp the whole file
    close($fh);

    return from_json($json);
}
# Open a MySQL connection through DBI.
#
# Parameters:
#   $sql - hash reference with the keys base, host, user and pass.
# Returns:
#   The database handle returned by DBI->connect.
# Dies with the DBI error text when the connection cannot be established, so
# the caller never receives a value that is not a database handle.
sub sqlconnect {
    my ($sql) = @_;
    my $dsn = "DBI:mysql:database=$sql->{base};host=$sql->{host}";
    my $dbh = DBI->connect($dsn, $sql->{user}, $sql->{pass})
        or die "Failed to connect to database: $DBI::errstr\n";
    return $dbh;
}
# Prepare and execute an SQL statement with bind values.
#
# Parameters:
#   $dbh   - database handle (first argument).
#   $query - SQL text, with '?' placeholders (second argument).
#   @args  - bind values for the placeholders (remaining arguments).
# Returns:
#   The executed statement handle.
# Dies when prepare or execute fails; the message says which step failed and
# carries the handle's error text (errstr).
sub sqlquery {
    my ($dbh, $query, @args) = @_;
    my $sth = $dbh->prepare($query)
        or die "Failed to prepare SQL query: " . $dbh->errstr;
    $sth->execute(@args)
        or die "Failed to execute SQL query: " . $sth->errstr;
    return $sth;
}
@@ -0,0 +1,91 @@ | |||
#!/usr/bin/perl | |||
use strict; | |||
use JSON; | |||
use DBI; | |||
use utf8; | |||
use GD::Simple; | |||
use Data::Dumper; | |||
use Data::UUID; | |||
use warnings; | |||
# Dictionary loader: inserts every line of the word lists below into the
# "dict" table together with its language key, showing a progress line with
# an estimated time remaining.
$| = 1;    # unbuffered STDOUT so the "\r" progress line is shown as it is printed
$Data::Dumper::Sortkeys = 1;

my $conf = load_conf("../etc/autodoc.json");
my $dbh  = sqlconnect($conf->{sql});

# Language key => word-list file names under /usr/share/dict.
my %lang = (
    'fr' => [ 'french' ],
    'de' => [ 'swiss', 'ngerman' ],
    'en' => [ 'british-english-large', 'american-english-large' ],
    'it' => [ 'italian' ],
);

foreach my $lang ( sort keys %lang ) {
    print "Loading language $lang ...\n";

    foreach my $dict ( @{$lang{$lang}} ) {
        my $file = '/usr/share/dict/'.$dict;

        # Line count for the progress display. The list-form pipe runs wc
        # directly, without a shell. A word list for which wc prints no count
        # (for instance a missing file) is skipped.
        my $len;
        if ( open(my $wc, '-|', 'wc', '-l', $file) ) {
            while ( my $out = <$wc> ) {
                $len = $1 if $out =~ /(\d+)/;
            }
            close($wc);
        }
        next if !defined $len;

        my $words;
        if ( !open($words, '<', $file) ) {
            warn "Failed to open $file: $!\n";
            next;
        }

        my $cnt   = 0;
        my $start = time();
        while ( my $word = <$words> ) {
            chomp $word;
            sqlquery($dbh, "INSERT IGNORE INTO dict SET word = ?, lang = ?",
                $word, $lang);
            $cnt++;

            # Refresh the progress line every 777 words and on the last one.
            if ( $cnt % 777 == 0 || $cnt == $len ) {
                my $elapsed = time() - $start;
                printf("\t%s %s/%s (%i%%) ETA: %ss \r",
                    $dict, $cnt, $len, int($cnt/$len*100),
                    $elapsed == 0 ? '-' : int( $elapsed/$cnt*$len - $elapsed )
                );
            }
        }
        close($words);
        printf("\n");
    }
}
# Read a JSON configuration file and return the decoded data.
#
# Parameters:
#   $file - path of the configuration file.
# Returns:
#   Whatever from_json() produces for the file content.
# Dies when the file cannot be opened; the message names the file and the
# operating-system error.
sub load_conf {
    my ($file) = @_;

    # Three-argument open with a lexical handle: the path can never be
    # interpreted as an open mode, and the handle closes with its scope.
    open(my $fh, '<', $file)
        or die "Failed to load configuration file $file: $!";
    my $json = do { local $/; <$fh> };    # slurp the whole file
    close($fh);

    return from_json($json);
}
# Open a MySQL connection through DBI, passing the mysql_enable_utf8
# connection attribute.
#
# Parameters:
#   $sql - hash reference with the keys base, host, user and pass.
# Returns:
#   The database handle returned by DBI->connect.
# Dies when the connection cannot be established.
sub sqlconnect {
    my $sql  = shift;
    my %attr = ( mysql_enable_utf8 => 1 );
    my $dsn  = join(';', "DBI:mysql:database=$sql->{base}", "host=$sql->{host}");

    my $handle = DBI->connect($dsn, $sql->{user}, $sql->{pass}, \%attr);
    die "Failed to connect to database" unless $handle;

    return $handle;
}
# Prepare and execute an SQL statement with bind values.
#
# Parameters:
#   $dbh   - database handle (first argument).
#   $query - SQL text, with '?' placeholders (second argument).
#   @args  - bind values for the placeholders (remaining arguments).
# Returns:
#   The executed statement handle.
# Dies when prepare or execute fails; the message says which step failed and
# carries the handle's error text.
sub sqlquery {
    my ($dbh, $query, @args) = @_;
    #print STDERR "$query\n";
    my $sth = $dbh->prepare($query)
        or die "Failed to prepare SQL query: " . $dbh->errstr;
    $sth->execute(@args)
        or die "Failed to execute SQL query: " . $sth->errstr;
    return $sth;
}