123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150 |
- #!/usr/bin/perl
-
- use strict;
- use JSON;
- use DBI;
- use GD::Simple;
- use Data::Dumper;
- use Data::UUID;
- use warnings;
-
# Sort hash keys in Data::Dumper output so debug dumps are stable between runs.
$Data::Dumper::Sortkeys = 1;

my $conf = load_conf("../etc/autodoc.json");
my $dbh  = sqlconnect($conf->{sql});

my $queuedir    = $conf->{path}{global} . '/' . $conf->{path}{queue};
my $originaldir = $conf->{path}{global} . '/' . $conf->{path}{original};

# Queue entries are named "<uuid>_<uuid>.<extension>". Only the first UUID
# (the document id) and the extension are used below. \z (rather than $)
# rejects names with a trailing newline, since the name is later interpolated
# into shell commands.
my $uuid_re       = qr/[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}/;
my $queue_name_re = qr/^($uuid_re)_$uuid_re\.([a-z]+)\z/;

opendir(my $queue, $queuedir)
    or die "Failed to open queue directory $queuedir: $!";

foreach my $file ( readdir($queue) ) {
    next unless $file =~ $queue_name_re;
    my ($docid, $ext) = ($1, $2);

    print "Found document id $docid of type $ext\n";

    # Only PDF documents are processed; every other type is skipped.
    next unless $ext eq 'pdf';

    # Walk the PDF one page at a time until pdftotext reports a failure,
    # which is how the end of the document is detected.
    for (my $page = 0; ; $page++) {
        my $pageno = $page + 1;    # the poppler tools count pages from 1

        print "texting page $page\n";

        # Single-string command, so it runs through the shell; the shell is
        # needed for the stderr redirect.
        my $text_cmd = sprintf("pdftotext -f %s -l %s %s/%s - 2>/dev/null",
            $pageno, $pageno, $queuedir, $file);
        open(my $text_fh, '-|', $text_cmd) or last;

        my $txt = '';
        while (my $line = <$text_fh>) {
            chomp $line;
            $txt .= ' ' . $line;
        }
        close($text_fh);    # sets $? to the exit status of the command

        # end of PDF
        last if $?;

        # Per-language word tally for this page; not persisted anywhere yet.
        my %lang = detect_lang($txt);

        my $pageid = get_new_page($docid);
        print "new page id $pageid\n";

        my $jpeg       = "$originaldir/$pageid.jpeg";
        my $render_cmd = sprintf("pdftoppm -f %s -l %s -r 600 -jpeg -singlefile %s/%s %s",
            $pageno, $pageno, $queuedir, $file, $jpeg);
        if ( system($render_cmd) != 0 ) {
            warn "pdftoppm failed for page $pageno of $file (status $?)\n";
            next;
        }

        # pdftoppm appends its own ".jpg" to the output root, leaving the
        # image at "<pageid>.jpeg.jpg"; move it to the intended name.
        my $written = "$jpeg.jpg";
        if ( -e $written ) {
            rename($written, $jpeg)
                or warn "Failed to rename $written to $jpeg: $!\n";
        }

        print "create original page jpeg $pageid.jpeg\n";
    }
}
closedir($queue);
-
# Processing outline:
#   - open the queue
#   - decompose each PDF into pages
#   - normalise all files as JPEGs
#   - generate a page record
#   - OCR / language detection
#   - update the database
-
# detect_lang($txt)
#
# Looks up every whitespace-separated word of $txt (lower-cased) in the
# `dict` table and counts how many words resolved to each language.
#
# Parameters:
#   $txt - text to analyse; undef or empty text yields an empty tally.
#
# Returns:
#   A hash (as a flat list) mapping language => number of matching words.
#
# Side effects:
#   Prints each word with the languages found for it, then a Data::Dumper
#   dump of the tally, to STDOUT. Dies (via sqlquery) on a database error.
#
# NOTE(review): when a word matches rows in several languages only the last
# row fetched is counted, as before - confirm this is intended.
sub detect_lang {
    my ($txt) = @_;

    my %lcnt;
    return %lcnt unless defined $txt;

    # split ' ' splits on runs of any whitespace and drops leading empties,
    # so no empty "words" are sent to the database.
    foreach my $word ( split ' ', lc $txt ) {
        print "$word ";

        # Neutralise LIKE wildcards so a token such as "%" or "_" cannot
        # match arbitrary dictionary rows. "!" is declared as the escape
        # character in the query itself.
        (my $pattern = $word) =~ s/([!%_])/!$1/g;

        my $lang;
        my $q = sqlquery($dbh,
            "SELECT lang FROM dict WHERE word like ? ESCAPE '!'", $pattern);
        while ( my ($found) = $q->fetchrow_array() ) {
            next unless defined $found;    # skip NULL lang values
            print "$found ";
            $lang = $found;
        }
        print "\n";

        $lcnt{$lang}++ if defined $lang;
    }

    print Dumper(\%lcnt);

    return %lcnt;
}
-
# get_new_page($docid)
#
# Creates a new page record and attaches it to a document.
#
# Parameters:
#   $docid - id of the document the page belongs to.
#
# Returns:
#   The freshly generated page id (from gen_uuid).
#
# Side effects:
#   Inserts one row into `pages` (status 'inprogress', created = NOW()) and
#   one row into `documents_pages` linking $docid to the new page.
sub get_new_page {
    my ($docid) = @_;

    my $pageid = gen_uuid();

    my $page_insert = join "\n",
        '',
        'INSERT INTO',
        'pages',
        'SET',
        'id = ?,',
        'created = NOW(),',
        q{status = 'inprogress'};

    my $link_insert = join "\n",
        '',
        'INSERT INTO',
        'documents_pages',
        'SET',
        'documentId = ?,',
        'pageId = ?';

    sqlquery($dbh, $page_insert, $pageid);
    sqlquery($dbh, $link_insert, $docid, $pageid);

    return $pageid;
}
-
# gen_uuid()
#
# Returns a newly created UUID string from Data::UUID, lower-cased.
sub gen_uuid {
    my $uuid = Data::UUID->new->create_str();
    return lc $uuid;
}
-
# load_conf($file)
#
# Reads the JSON configuration file at $file and returns the decoded data
# structure produced by from_json.
#
# Parameters:
#   $file - path of the configuration file.
#
# Dies if the file cannot be opened; the message names the file and the
# system error. from_json reports malformed JSON itself.
sub load_conf {
    my ($file) = @_;

    # Three-argument open with a lexical handle: the path can never be
    # mistaken for an open mode, and the handle does not leak globally.
    open(my $fh, '<', $file)
        or die "Failed to load configuration file $file: $!";

    # Slurp the whole file in one read.
    my $json = do { local $/; <$fh> };
    close($fh);

    return from_json($json);
}
-
# sqlconnect($sql)
#
# Opens a MySQL connection through DBI.
#
# Parameters:
#   $sql - hash reference with the keys `base` (database name), `host`,
#          `user` and `pass`.
#
# Returns:
#   The connected DBI database handle.
#
# Dies with the DBI error text if the connection cannot be established.
sub sqlconnect {
    my ($sql) = @_;

    my $dsn = "DBI:mysql:database=$sql->{base};host=$sql->{host}";

    my $handle = DBI->connect($dsn, $sql->{user}, $sql->{pass});
    if ( !$handle ) {
        my $why = defined $DBI::errstr ? $DBI::errstr : 'unknown error';
        die "Failed to connect to database: $why";
    }

    return $handle;
}
-
# sqlquery($dbh, $query, @args)
#
# Prepares $query on the given database handle and executes it with @args
# as bind values.
#
# Parameters:
#   $dbh   - database handle providing prepare() and errstr().
#   $query - SQL text, with `?` placeholders for the bind values.
#   @args  - bind values, passed to execute() in order.
#
# Returns:
#   The executed statement handle, ready for fetching.
#
# Dies if either step fails; the message says which step failed and carries
# the driver's error text.
sub sqlquery {
    my ($handle, $query, @bind) = @_;

    #print STDERR "$query\n";

    my $sth = $handle->prepare($query);
    if ( !$sth ) {
        my $why = $handle->errstr;
        die "Failed to prepare SQL query: "
            . ( defined $why ? $why : 'unknown error' );
    }

    if ( !$sth->execute(@bind) ) {
        my $why = $sth->errstr;
        die "Failed to execute SQL query: "
            . ( defined $why ? $why : 'unknown error' );
    }

    return $sth;
}
|