123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192 |
- #!/usr/bin/perl
-
- use strict;
- use JSON;
- use DBI;
- use GD::Simple;
- use Data::Dumper;
- use Data::UUID;
- use File::Temp;
- use warnings;
-
# Dump hash keys in sorted order so the debug output is stable between runs.
$Data::Dumper::Sortkeys = 1;

# Configuration and database handle; the subs below use $dbh directly.
my $conf = load_conf("../etc/autodoc.json");
my $dbh = sqlconnect($conf->{sql});

# Queued documents are read from $queuedir; one JPEG per page is written
# to $originaldir.
my $queuedir = $conf->{path}{global}.'/'.$conf->{path}{queue};
my $originaldir = $conf->{path}{global}.'/'.$conf->{path}{original};

# Maps each language short code of the lang table to its id.
my %langid;
my $q = sqlquery($dbh, "SELECT id,short FROM lang");
while (my ($id, $short) = $q->fetchrow_array()) {
    $langid{$short} = $id;
}

# Lower-case hexadecimal UUID as it appears in queue file names.
my $uuid_re = qr/[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}/;

opendir(my $queue, $queuedir)
    or die "Failed to open queue directory $queuedir: $!";
my @queued = readdir($queue);
closedir($queue);

FILE:
foreach my $file (@queued) {
    # Queue entries are named <docid>_<uuid>.<ext>; anything else is skipped.
    next FILE unless $file =~ /\A($uuid_re)_$uuid_re\.([a-z]+)\z/;
    my ($docid, $ext) = ($1, $2);

    print "Found document id $docid of type $ext\n";

    # Only PDF documents are processed; other types are left in the queue.
    next FILE if $ext ne 'pdf';

    my $pdf = "$queuedir/$file";

    # The page count is not known in advance: keep asking for the next page
    # until pdftotext reports a failure.
    PAGE:
    for (my $page = 0; ; $page++) {
        my $pageno = $page + 1;    # the poppler tools count pages from 1

        print "texting page $page\n";
        my $txt = _pdf_page_text($pdf, $pageno);

        # end of PDF
        last PAGE if !defined $txt;

        my ($lang, $words) = detect_lang($txt);
        print "language is ", (defined $lang ? $lang : 'unknown'), "\n";
        #spell_check($txt,$lang);

        my $pageid = get_new_page($docid);
        if (!defined $pageid) {
            warn "No page id returned for document $docid, skipping the rest of it\n";
            last PAGE;
        }
        print "new page id $pageid\n";
        update_page_status($pageid, 'inprogress');

        print "create original page jpeg $pageid.jpeg\n";
        my $image = "$originaldir/$pageid";

        # List-form system() bypasses the shell, so paths containing spaces
        # or metacharacters are passed through untouched.
        my $failed = system('pdftoppm',
            '-f', $pageno, '-l', $pageno,
            '-r', '300', '-jpeg', '-singlefile',
            $pdf, $image);
        if ($failed) {
            warn "pdftoppm failed for page $pageno of $file (status $?)\n";
        }
        elsif (!rename("$image.jpg", "$image.jpeg")) {
            # The old shell "mv" moved the .jpg that pdftoppm writes to the
            # .jpeg name; rename() does the same within one directory.
            warn "Failed to rename $image.jpg to $image.jpeg: $!\n";
        }

        # NOTE(review): as before, the words are stored and the page is
        # marked 'ok' even when the image step failed - confirm whether a
        # different status should be recorded in that case.
        create_page_words($pageid, $lang, $words);

        update_page_status($pageid, 'ok');
    }
}

# Extract the text of one page of a PDF with pdftotext.
#
# Parameters:
#   $pdf    - path of the PDF file
#   $pageno - page number, counted from 1
#
# Returns every output line with its newline removed and a single space
# prepended, concatenated into one string. Returns undef when pdftotext
# cannot be started or exits with a non-zero status; the caller treats that
# as the end of the document.
sub _pdf_page_text {
    my ($pdf, $pageno) = @_;

    # pdftotext's diagnostics used to be discarded with "2>/dev/null". The
    # child inherits our STDERR, so point it at /dev/null only while the
    # child is being started and put the real one back straight afterwards.
    open(my $saved_err, '>&', \*STDERR)
        or die "Failed to save STDERR: $!";
    open(STDERR, '>', '/dev/null')
        or die "Failed to silence STDERR: $!";
    my $pid = open(my $pipe, '-|',
        'pdftotext', '-f', $pageno, '-l', $pageno, $pdf, '-');
    open(STDERR, '>&', $saved_err)
        or die "Failed to restore STDERR: $!";
    close($saved_err);

    return if !$pid;

    my $txt = '';
    while (my $line = <$pipe>) {
        chomp $line;
        $txt .= ' ' . $line;
    }

    # Closing a pipe waits for the child and stores its exit status in $?.
    close($pipe);
    return if $?;

    return $txt;
}
-
- # open queue
- # decompose PDF
- # normalise all files as jpegs
- # generate page
- # ocr / lang detect
- # update db
-
# Store every word of a page through the add_page_word stored procedure.
#
# Parameters:
#   $pageid - id of the page the words belong to
#   $lang   - language short code, looked up in %langid; may be undef
#   $words  - reference to the array of words to store
#
# The language id passed to the procedure is undef when $lang is undefined
# or has no entry in %langid.
sub create_page_words {
    my($pageid, $lang, $words) = @_;

    # The language is the same for every word, so resolve it once. The
    # defined() guard avoids an "uninitialized value" warning when no
    # language could be detected.
    my $lang_id = defined $lang ? $langid{$lang} : undef;

    foreach my $word ( @{ $words || [] } ) {
        sqlquery($dbh, "CALL add_page_word(?,?,?)",
            $pageid, $word, $lang_id);
    }

    return;
}
-
# Run a text through "aspell pipe" and print aspell's output lines.
#
# Parameters:
#   $txt  - text to check
#   $lang - language code handed to aspell's --lang option
#
# Nothing is returned. The check is skipped with a warning when the
# language code is unusable or aspell cannot be started.
sub spell_check {
    my($txt, $lang) = @_;

    # $lang becomes part of a shell command line, so accept only plain
    # language codes such as "en", "fr" or "pt_BR".
    if (!defined $lang || $lang !~ /\A[A-Za-z]{2,3}(?:[_-][A-Za-z0-9]+)*\z/) {
        warn "spell_check: unusable language code, skipping\n";
        return;
    }

    my $tmp = File::Temp->new();
    print $tmp $txt;

    # The text is still sitting in Perl's output buffer; flush it so that
    # aspell, which reads the file by name, actually sees it.
    $tmp->flush();

    my $command = sprintf("aspell --lang=%s --ignore-case pipe < %s",
        $lang, $tmp->filename);
    my $aspell;
    if (!open($aspell, '-|', $command)) {
        warn "spell_check: failed to start aspell: $!\n";
        return;
    }
    while (my $line = <$aspell>) {
        print $line;
    }
    close($aspell);

    return;
}
-
-
# Guess the language of a text and list the words it contains.
#
# The text is cut into runs of ASCII letters and Latin-1 accented letters
# (U+00C0-U+00FF). Runs shorter than three characters are dropped and the
# remaining words are lower-cased. Every word is looked up in the dict
# table, and each language returned for it gets one vote per occurrence of
# the word. The vote counts are printed with Data::Dumper.
#
# Parameters:
#   $txt - text to analyse, either UTF-8 encoded bytes or an already
#          decoded character string
#
# Returns a two-element list:
#   - the language with the most votes; ties go to the language that sorts
#     first, and the value is undef when no word was found in the dictionary
#   - a reference to the array of words in order of appearance, in the same
#     form (UTF-8 bytes or characters) as the input
sub detect_lang {
    my($txt) = @_;

    my %lcnt;
    my @words;

    $txt = '' if !defined $txt;

    # The old character class \x{c380}-\x{c3bf} named code points, not the
    # UTF-8 byte pairs C3 80 - C3 BF it was meant to cover, so accented
    # letters always split words. Decode byte input so that one letter is
    # one character; utf8::decode leaves the string alone and returns false
    # when it is not valid UTF-8.
    my $was_bytes = !utf8::is_utf8($txt) && utf8::decode($txt);

    my %langs_of;    # word => languages from dict, so each word is queried once

    foreach my $word ( split(/[^a-zA-Z\x{c0}-\x{ff}]+/, $txt) ) {
        next if length($word) < 3;
        $word = lc($word);

        # Hand the word on in the encoding it arrived in.
        utf8::encode($word) if $was_bytes;
        push @words, $word;

        if (!exists $langs_of{$word}) {
            my @langs;
            my $q = sqlquery($dbh, "SELECT lang FROM dict WHERE word like ?", $word);
            while (my ($l) = $q->fetchrow_array()) {
                push @langs, $l;
            }
            $langs_of{$word} = \@langs;
        }

        $lcnt{$_}++ for @{ $langs_of{$word} };
    }

    print Dumper(\%lcnt);

    # Walk the languages in sorted order so that a tie always resolves the
    # same way instead of depending on hash ordering.
    my $max = 0;
    my $lmax;
    foreach my $lang ( sort keys %lcnt ) {
        if ( $lcnt{$lang} > $max ) {
            $lmax = $lang;
            $max = $lcnt{$lang};
        }
    }

    return ($lmax, \@words);
}
-
# Set the status of a page by calling the update_page_status stored
# procedure with the page id and the new status.
# Returns the statement handle produced by sqlquery.
sub update_page_status {
    my $pageid = shift;
    my $status = shift;

    return sqlquery(
        $dbh,
        "CALL update_page_status(?,?)",
        $pageid,
        $status,
    );
}
-
# Create a page for a document through the create_page stored procedure.
#
# Parameters:
#   $docid - id of the document the page belongs to
#
# Returns the first column of the last row the procedure produced, or
# undef when it produced no rows.
sub get_new_page {
    my($docid) = @_;

    my $sth = sqlquery($dbh, "CALL create_page(?)", $docid);

    # Read the result set to its end; the last row read wins.
    my $newest;
    while (my @row = $sth->fetchrow_array()) {
        $newest = $row[0];
    }

    return $newest;
}
-
# Produce a new UUID string from Data::UUID, converted to lower case.
sub gen_uuid {
    my $uuid = Data::UUID->new->create_str();

    return lc $uuid;
}
-
# Read a JSON configuration file and return the decoded data structure.
#
# Parameters:
#   $file - path of the configuration file
#
# Dies with "Failed to load configuration file", followed by the file name
# and the system error, when the file cannot be opened. Errors raised by
# from_json for malformed content are passed on unchanged.
sub load_conf {
    my($file) = @_;

    # Three-argument open with a lexical handle: the file name can never be
    # mistaken for an open mode and the handle does not leak out of the sub.
    open(my $fh, '<', $file)
        or die "Failed to load configuration file $file: $!";

    # Read the whole file in one go.
    my $x = do { local $/; <$fh> };
    close($fh);

    $x = '' if !defined $x;

    return from_json($x);
}
-
# Open a connection to the MySQL database described by the configuration.
#
# Parameters:
#   $sql - hash reference with the keys base (database name), host, user
#          and pass
#
# Returns the DBI database handle. Dies with "Failed to connect to
# database", followed by the DBI error text, when the connection cannot be
# made.
sub sqlconnect {
    my($sql) = @_;

    my $dsn = "DBI:mysql:database=$sql->{base};host=$sql->{host}";

    # Perl statements continue across lines on their own; the backslashes
    # that used to precede "die" were not a line continuation but took a
    # reference of die's result.
    my $dbh = DBI->connect($dsn, $sql->{user}, $sql->{pass})
        or die "Failed to connect to database: "
            . (defined $DBI::errstr ? $DBI::errstr : 'unknown error');

    return $dbh;
}
-
# Prepare and execute an SQL statement with bound parameters.
#
# Parameters:
#   $dbh   - database handle to run the statement on
#   $query - SQL text, with ? placeholders
#   @args  - values bound to the placeholders, in order
#
# Returns the executed statement handle. Dies when the statement cannot be
# prepared ("Failed to prepare SQL query ...") or executed ("Failed to
# execute SQL query ..."); both messages carry the query and the error text
# reported by the handle.
sub sqlquery {
    my $dbh = shift;
    my $query = shift;
    my @args = @_;

    #print STDERR "$query\n";

    my $sth = $dbh->prepare($query);
    if (!$sth) {
        my $why = $dbh->errstr;
        die "Failed to prepare SQL query ($query): "
            . (defined $why ? $why : 'unknown error');
    }

    if (!$sth->execute(@args)) {
        my $why = $sth->errstr;
        die "Failed to execute SQL query ($query): "
            . (defined $why ? $why : 'unknown error');
    }

    return $sth;
}
|