#!/usr/bin/perl
#
# Queue worker.
#
# Scans the queue directory for files named "<uuid>_<uuid>.<ext>" and, for
# every PDF found, walks the document page by page:
#   - extracts the page text with pdftotext,
#   - guesses the page language from the `dict` table,
#   - registers a new page through the create_page stored procedure,
#   - renders the page as a 300 dpi JPEG into the "original" directory,
#   - stores the page words through the add_page_word stored procedure.
#
# Outline kept from the original notes:
#   open queue / decompose PDF / normalise all files as jpegs /
#   generate page / ocr / lang detect / update db

use strict;
use warnings;

use JSON;
use DBI;
use GD::Simple;
use Data::Dumper;
use Data::UUID;
use File::Temp;

$Data::Dumper::Sortkeys = 1;

my $conf = load_conf("../etc/autodoc.json");
my $dbh  = sqlconnect($conf->{sql});

my $queuedir    = $conf->{path}{global}.'/'.$conf->{path}{queue};
my $originaldir = $conf->{path}{global}.'/'.$conf->{path}{original};

# Language short code => numeric id, loaded once from the `lang` table.
my %langid;
my $q = sqlquery($dbh, "SELECT id,short FROM lang");
while (my ($id, $short) = $q->fetchrow_array()) {
    $langid{$short} = $id;
}

# Lower-case textual UUID, as used twice in every queue file name.
my $uuid_re = qr/[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}/;

opendir(my $queue, $queuedir)
    or die "Failed to open queue directory $queuedir: $!";
foreach my $file ( readdir($queue) ) {
    # Queue entries look like "<docid>_<uuid>.<ext>"; anything else is ignored.
    next unless $file =~ /\A($uuid_re)_($uuid_re)\.([a-z]+)\z/;
    my ($docid, $ext) = ($1, $3);
    print "Found document id $docid of type $ext\n";

    # Only PDFs are handled; other extensions are recognised but skipped.
    next unless $ext eq 'pdf';
    process_pdf($docid, $file);
    # NOTE(review): the queue file is left in place after processing, so a
    # second run will process it again - confirm whether that is intended.
}
closedir($queue);

# Process every page of one queued PDF.
#   $docid - document id taken from the file name
#   $file  - file name inside the queue directory
# Pages are probed one by one until pdftotext reports a failure, which is
# taken as "past the last page".
sub process_pdf {
    my ($docid, $file) = @_;
    my $pdf = "$queuedir/$file";

    for (my $page = 0; ; $page++) {
        print "texting page $page\n";
        my $txt = extract_page_text($pdf, $page + 1);
        last if !defined $txt;    # end of PDF

        my ($lang, $words) = detect_lang($txt);
        print "language is ", (defined $lang ? $lang : '(unknown)'), "\n";
        #spell_check($txt,$lang);

        my $pageid = get_new_page($docid);
        defined $pageid
            or die "create_page returned no page id for document $docid";
        print "new page id $pageid\n";
        update_page_status($pageid, 'inprogress');

        print "create original page jpeg $pageid.jpeg\n";
        my $rendered = render_page_jpeg($pdf, $page + 1, "$originaldir/$pageid");

        create_page_words($pageid, $lang, $words);

        # A page whose image could not be produced stays 'inprogress'.
        update_page_status($pageid, 'ok') if $rendered;
    }
    return;
}

# Extract the text of one page (1-based) of $pdf using pdftotext.
# Lines are joined with single spaces, each prefixed by a space.
# Returns the text, or undef when pdftotext cannot be started or exits
# with a non-zero status.
sub extract_page_text {
    my ($pdf, $pageno) = @_;

    # List-form pipe open: no shell is involved. "-q" silences pdftotext's
    # own messages (the original redirected stderr to /dev/null).
    open(my $out, '-|', 'pdftotext', '-q',
         '-f', $pageno, '-l', $pageno, $pdf, '-')
        or return;

    my $txt = '';
    while (my $line = <$out>) {
        chomp $line;
        $txt .= ' ' . $line;
    }
    close($out);    # sets $? to the exit status of pdftotext
    return if $?;
    return $txt;
}

# Render one page (1-based) of $pdf as a 300 dpi JPEG at "$target.jpeg".
# pdftoppm writes "$target.jpg"; the file is then renamed to ".jpeg".
# Returns 1 on success, 0 (after a warning) on failure.
sub render_page_jpeg {
    my ($pdf, $pageno, $target) = @_;

    my $status = system('pdftoppm', '-f', $pageno, '-l', $pageno,
                        '-r', '300', '-jpeg', '-singlefile', $pdf, $target);
    if ($status != 0) {
        warn "pdftoppm failed for page $pageno of $pdf (status $status)\n";
        return 0;
    }
    if (!rename("$target.jpg", "$target.jpeg")) {
        warn "Failed to rename $target.jpg to $target.jpeg: $!\n";
        return 0;
    }
    return 1;
}

# Store every word of a page through the add_page_word stored procedure.
#   $pageid - page id
#   $lang   - language short code (may be undef when detection found nothing)
#   $words  - array reference of words
sub create_page_words {
    my ($pageid, $lang, $words) = @_;

    # No detected language: pass undef instead of looking up an undef key.
    my $lang_id = defined $lang ? $langid{$lang} : undef;

    foreach my $word ( @{$words} ) {
        sqlquery($dbh, "CALL add_page_word(?,?,?)", $pageid, $word, $lang_id);
    }
    return;
}

# Feed $txt to "aspell pipe" for language $lang and echo aspell's output.
# Currently not called from the main flow.
sub spell_check {
    my ($txt, $lang) = @_;
    return if !defined $lang;

    my $tmp = File::Temp->new();
    print {$tmp} $txt;
    $tmp->flush();    # make sure aspell sees the text, not an empty file

    # The language and file name travel as positional parameters, so they
    # are never interpreted as shell syntax.
    my $aspell;
    if (!open($aspell, '-|', 'sh', '-c',
              'aspell --lang="$1" --ignore-case pipe < "$2"',
              'sh', $lang, $tmp->filename)) {
        warn "Failed to run aspell: $!\n";
        return;
    }
    while (my $line = <$aspell>) {
        print $line;
    }
    close($aspell);
    return;
}

# Guess the language of $txt by looking every word up in the `dict` table.
# Words shorter than three characters are ignored; the rest are lower-cased.
# Returns ($lang, \@words): the language with the most dictionary hits
# (undef when nothing matched) and the list of words kept.
sub detect_lang {
    my ($txt) = @_;
    my %lcnt;
    my @words;

    # NOTE(review): \x{c380}-\x{c3bf} looks like the UTF-8 byte pairs for
    # U+00C0..U+00FF written as code points - confirm the intended range
    # and the encoding of the text before changing it.
    foreach my $token ( split(/[^a-zA-Z\x{c380}-\x{c3bf}]/, $txt) ) {
        next if length($token) < 3;
        my $word = lc($token);
        push @words, $word;

        my $q = sqlquery($dbh, "SELECT lang FROM dict WHERE word like ?", $word);
        while (my ($hit) = $q->fetchrow_array()) {
            $lcnt{$hit}++;
        }
    }
    print Dumper(\%lcnt);

    # Sorted keys make the winner deterministic when two languages tie.
    my $max = 0;
    my $lmax;
    foreach my $lang ( sort keys %lcnt ) {
        if ( $lcnt{$lang} > $max ) {
            $lmax = $lang;
            $max  = $lcnt{$lang};
        }
    }
    return ($lmax, \@words);
}

# Set the status of a page through the update_page_status stored procedure.
sub update_page_status {
    my ($pageid, $status) = @_;
    sqlquery($dbh, "CALL update_page_status(?,?)", $pageid, $status);
    return;
}

# Create a page for $docid through the create_page stored procedure.
# Returns the first column of the last row the call produced, or undef
# when it produced no rows.
sub get_new_page {
    my ($docid) = @_;
    my $pageid;
    my $q = sqlquery($dbh, "CALL create_page(?)", $docid);
    while (my ($id) = $q->fetchrow_array()) {
        $pageid = $id;
    }
    return $pageid;
}

# Return a freshly generated UUID string in lower case.
sub gen_uuid {
    my $ug = Data::UUID->new;
    return lc($ug->create_str());
}

# Read the JSON configuration file $file and return the decoded structure.
# Dies when the file cannot be opened.
sub load_conf {
    my ($file) = @_;
    open(my $fh, '<', $file)
        or die "Failed to load configuration file $file: $!";
    my $json = do { local $/; <$fh> };    # slurp the whole file
    close($fh);
    return from_json($json);
}

# Connect to the MySQL database described by $sql (keys: base, host, user,
# pass) and return the handle. Dies when the connection fails.
sub sqlconnect {
    my ($sql) = @_;
    my $dsn = "DBI:mysql:database=$sql->{base};host=$sql->{host}";
    my $dbh = DBI->connect($dsn, $sql->{user}, $sql->{pass})
        or die "Failed to connect to database: "
             . (defined $DBI::errstr ? $DBI::errstr : 'unknown error');
    return $dbh;
}

# Prepare and execute $query on $dbh with the remaining arguments as bind
# values. Returns the executed statement handle; dies on failure.
sub sqlquery {
    my $dbh   = shift;
    my $query = shift;
    my @args  = @_;
    #print STDERR "$query\n";
    my $sth = $dbh->prepare($query)
        or die "Failed to prepare SQL query: " . $dbh->errstr;
    $sth->execute(@args)
        or die "Failed to execute SQL query: " . $sth->errstr;
    return $sth;
}