#!/usr/bin/perl
#
# Queue worker: scans the configured queue directory for uploaded documents,
# walks every PDF page by page, registers each page in the database and
# renders it as a JPEG into the configured "original" directory.
#
# Planned pipeline (author's notes):
#   - open queue
#   - decompose PDF
#   - normalise all files as jpegs
#   - generate page
#   - ocr / lang detect
#   - update db

use strict;
use warnings;
use JSON;
use DBI;
use GD::Simple;
use Data::Dumper;
use Data::UUID;

$Data::Dumper::Sortkeys = 1;

my $conf = load_conf("../etc/autodoc.json");
my $dbh  = sqlconnect($conf->{sql});

my $queuedir    = $conf->{path}{global} . '/' . $conf->{path}{queue};
my $originaldir = $conf->{path}{global} . '/' . $conf->{path}{original};

# Queue entries are named "<uuid>_<uuid>.<ext>"; the first UUID is used as
# the document id, the extension selects the handler.
my $uuid_re  = qr/[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}/;
my $queue_re = qr/^($uuid_re)_$uuid_re\.([a-z]+)$/;

opendir(my $queue_dh, $queuedir)
    or die "Failed to open queue directory $queuedir: $!";

foreach my $file (readdir($queue_dh)) {
    next unless $file =~ $queue_re;
    my ($docid, $ext) = ($1, $2);
    print "Found document id $docid of type $ext\n";

    if ($ext eq 'pdf') {
        process_pdf($docid, "$queuedir/$file");
    }
    else {
        # Non-PDF inputs are recognised but not processed yet.
    }
}

closedir($queue_dh);

# process_pdf($docid, $pdf_path)
#
# Walks the PDF one page at a time until text extraction reports a failure
# (which is how the end of the document is detected). For every page it runs
# language detection on the extracted text, creates a page record linked to
# $docid and renders the page at 600 dpi with pdftoppm.
sub process_pdf {
    my ($docid, $pdf_path) = @_;

    for (my $page = 0; ; $page++) {
        # pdftotext / pdftoppm count pages from 1, the progress output from 0.
        my $page_no = $page + 1;
        print "texting page $page\n";

        my $txt = extract_page_text($pdf_path, $page_no);

        # end of PDF
        last unless defined $txt;

        # The per-language counts are computed but not stored anywhere yet.
        my %lang = detect_lang($txt);

        my $pageid = get_new_page($docid);
        print "new page id $pageid\n";

        # List-form system() bypasses the shell, so paths need no quoting.
        # NOTE(review): pdftoppm appends its own extension to the output
        # prefix, so the file on disk is probably "$pageid.jpeg.jpg" (the
        # original code carried a commented-out "mv" for this) - confirm and
        # rename if later stages expect "$pageid.jpeg".
        system('pdftoppm',
               '-f', $page_no, '-l', $page_no,
               '-r', 600, '-jpeg', '-singlefile',
               $pdf_path, "$originaldir/$pageid.jpeg") == 0
            or warn "pdftoppm failed for page $page_no of $pdf_path (status $?)\n";

        print "create original page jpeg $pageid.jpeg\n";
    }

    return;
}

# extract_page_text($pdf_path, $page_no)
#
# Runs pdftotext on a single page and returns its text with every line
# prefixed by a space and joined into one string. Returns undef when the
# command cannot be started or exits with a non-zero status.
sub extract_page_text {
    my ($pdf_path, $page_no) = @_;

    # The command goes through the shell so that stderr can be discarded.
    my $cmd = sprintf("pdftotext -f %s -l %s %s - 2>/dev/null",
                      $page_no, $page_no, $pdf_path);
    open(my $txt_fh, '-|', $cmd) or return;

    my $txt = '';
    while (my $line = <$txt_fh>) {
        chomp $line;
        $txt .= ' ' . $line;
    }

    # close() on a pipe waits for the child and stores its status in $?.
    close($txt_fh);
    return if $?;

    return $txt;
}

# detect_lang($txt)
#
# Splits $txt into words, looks each lower-cased word up in the "dict" table
# and counts how many words were attributed to each language. When several
# rows match a word, the last row fetched wins. Prints the lookups and the
# final counts as progress output.
#
# Returns a hash (as a list) mapping language => number of matching words.
sub detect_lang {
    my ($txt) = @_;
    my %lcnt;

    foreach my $word (split(/[ '".-]/, $txt)) {
        # Consecutive separators produce empty tokens; skip the DB round trip.
        next if $word eq '';

        $word = lc($word);
        print "$word ";

        # NOTE(review): with LIKE, any "%" or "_" inside $word acts as a
        # wildcard - confirm whether an exact match was intended.
        my $lang;
        my $q = sqlquery($dbh, "SELECT lang FROM dict WHERE word like ?", $word);
        while (my ($match) = $q->fetchrow_array()) {
            print "$match ";
            $lang = $match;
        }
        print "\n";

        $lcnt{$lang}++ if defined $lang;
    }

    print Dumper(\%lcnt);
    return %lcnt;
}

# get_new_page($docid)
#
# Inserts a new row into "pages" with status 'inprogress', links it to
# $docid in "documents_pages" and returns the generated page id.
sub get_new_page {
    my ($docid) = @_;
    my $pageid = gen_uuid();

    sqlquery($dbh,
        "INSERT INTO pages SET id = ?, created = NOW(), status = 'inprogress'",
        $pageid);
    sqlquery($dbh,
        "INSERT INTO documents_pages SET documentId = ?, pageId = ?",
        $docid, $pageid);

    return $pageid;
}

# gen_uuid()
#
# Returns a freshly generated UUID string in lower case.
sub gen_uuid {
    my $ug = Data::UUID->new;
    return lc($ug->create_str());
}

# load_conf($file)
#
# Reads $file and returns the data structure decoded from its JSON content.
# Dies when the file cannot be opened.
sub load_conf {
    my ($file) = @_;

    open(my $conf_fh, '<', $file)
        or die "Failed to load configuration file $file: $!";
    my $json = do { local $/; <$conf_fh> };
    close($conf_fh);

    # An empty file reads as undef in slurp mode; hand from_json a string.
    $json = '' unless defined $json;
    return from_json($json);
}

# sqlconnect($sql)
#
# Connects to the MySQL database described by the hash reference $sql
# (keys: base, host, user, pass) and returns the handle. Dies on failure.
sub sqlconnect {
    my ($sql) = @_;

    my $dsn = "DBI:mysql:database=$sql->{base};host=$sql->{host}";
    my $dbh = DBI->connect($dsn, $sql->{user}, $sql->{pass})
        or die "Failed to connect to database: $DBI::errstr";

    return $dbh;
}

# sqlquery($dbh, $query, @args)
#
# Prepares $query on $dbh, executes it with @args bound to its placeholders
# and returns the executed statement handle. Dies when either step fails.
sub sqlquery {
    my $dbh   = shift;
    my $query = shift;
    my @args  = @_;

    #print STDERR "$query\n";
    my $sth = $dbh->prepare($query)
        or die "Failed to prepare SQL query: " . $dbh->errstr;
    $sth->execute(@args)
        or die "Failed to execute SQL query: " . $sth->errstr;

    return $sth;
}