root пре 5 година
родитељ
комит
a23ea29ec8
3 измењених фајлова са 291 додато и 0 уклоњено
  1. 147
    0
      bin/autodoc_process.pl
  2. 53
    0
      bin/dev_cleanup_db.pl
  3. 91
    0
      bin/load_dicts.pl

+ 147
- 0
bin/autodoc_process.pl Прегледај датотеку

@@ -0,0 +1,147 @@
#!/usr/bin/perl

use strict;
use JSON;
use DBI;
use GD::Simple;
use Data::Dumper;
use Data::UUID;
use warnings;

# Keep Data::Dumper output stable between runs.
$Data::Dumper::Sortkeys = 1;

# Load the JSON configuration (the path is relative to the current working
# directory) and connect to the database described by its "sql" section.
my $conf = load_conf("../etc/autodoc.json");
my $dbh = sqlconnect($conf->{sql});

# Directory scanned for incoming files, and directory receiving page images.
my $queuedir = $conf->{path}{global}.'/'.$conf->{path}{queue};
my $originaldir = $conf->{path}{global}.'/'.$conf->{path}{original};

# Queue entries are named "<uuid>_<uuid>.<ext>"; the first UUID is used as
# the document id, the second one is currently not used.
my $queue_re = qr/^([0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12})_([0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12})\.([a-z]+)$/;

opendir(my $queue_dh, $queuedir)
    || die "Failed to open queue directory $queuedir: $!";

foreach my $file ( readdir($queue_dh) ) {
    next unless $file =~ $queue_re;
    my ($docid, $ext) = ($1, $3);

    print "Found document id $docid of type $ext\n";

    # Only PDF documents are processed; other types are recognised but skipped.
    next if $ext ne 'pdf';

    # Walk the PDF one page at a time until pdftotext reports a failure,
    # which is taken to mean "past the last page".
    for (my $page = 0;; $page++) {
        my $pageno = $page + 1;    # the poppler tools count pages from 1
        my $txt = '';
        print "texting page $page\n";

        # The shell is kept here only for the stderr redirection; $file has
        # already been restricted to hex digits, dashes and a-z by $queue_re.
        my $totext = sprintf("pdftotext -f %s -l %s %s/%s - 2>/dev/null",
            $pageno, $pageno, $queuedir, $file);
        open(my $txt_fh, '-|', $totext) || last;
        while ( my $line = <$txt_fh> ) {
            chomp $line;
            $txt .= ' ' . $line;
        }
        close($txt_fh);

        # end of PDF
        last if $?;

        # The language counts are computed (and dumped by detect_lang) but
        # not stored anywhere yet.
        my %lang = detect_lang($txt);
        my $pageid = get_new_page($docid);
        print "new page id $pageid\n";

        # List-form system() bypasses the shell, so directory names with
        # spaces or shell metacharacters are passed through untouched.
        # NOTE(review): the previously disabled "mv ... .jpeg.jpg" step
        # suggests pdftoppm appends its own ".jpg" suffix to the output root;
        # confirm the resulting file name against what consumers expect.
        my $status = system('pdftoppm',
            '-f', $pageno, '-l', $pageno,
            '-r', 600, '-jpeg', '-singlefile',
            "$queuedir/$file", "$originaldir/$pageid.jpeg");
        if ( $status != 0 ) {
            warn "pdftoppm failed for page $pageno of $file (status $status)\n";
        }

        print "create original page jpeg $pageid.jpeg\n";
    }
}
closedir($queue_dh);

# open queue
# decompose PDF
# normalise all files as jpegs
# generate page
# ocr / lang detect
# update db

# Count, per language, how many words of a text are found in the "dict" table.
#
# Parameters:
#   $txt - text to analyse; words are taken to be separated by single spaces.
#
# Returns a flat key/value list (language => number of matching words) that
# can be assigned directly to a hash. The same counts are also dumped to
# STDOUT via Data::Dumper.
#
# Uses the file-level $dbh handle and sqlquery(); dies if a query fails.
sub detect_lang {
    my($txt) = @_;

    my %lcnt;
    my %lang_of;    # per-call cache: lowercased word => language (or undef)

    foreach my $word ( split(/ /, $txt) ) {
        # Leading or repeated spaces yield empty fields; nothing to look up.
        next if $word eq '';
        $word = lc($word);

        if ( !exists $lang_of{$word} ) {
            my $lang;
            # Exact comparison: with LIKE, a "%" or "_" inside a word taken
            # from the document would act as a wildcard and match unrelated
            # dictionary entries.
            my $q = sqlquery($dbh, "SELECT lang FROM dict WHERE word = ?", $word);
            # If several rows match, the last one returned wins.
            while ( my ($l) = $q->fetchrow_array() ) {
                $lang = $l;
            }
            $lang_of{$word} = $lang;
        }

        my $lang = $lang_of{$word};
        $lcnt{$lang}++ if defined $lang;
    }

    print Dumper(\%lcnt);

    return %lcnt;
}

# Register a fresh page for a document and hand back the page id.
#
# Parameters:
#   $docid - id of the document the page belongs to.
#
# Inserts one row into "pages" (status 'inprogress') and one row into
# "documents_pages" linking the document to the new page, using the
# file-level $dbh handle. Returns the generated page id.
sub get_new_page {
    my $docid = shift;

    my $new_id = gen_uuid();

    my $page_sql = "
INSERT INTO
pages
SET
id = ?,
created = NOW(),
status = 'inprogress'";

    my $link_sql = "
INSERT INTO
documents_pages
SET
documentId = ?,
pageId = ?";

    sqlquery($dbh, $page_sql, $new_id);
    sqlquery($dbh, $link_sql, $docid, $new_id);

    return $new_id;
}

# Produce a new UUID string, lowercased, using Data::UUID.
sub gen_uuid {
    return lc( Data::UUID->new->create_str() );
}

# Read a JSON configuration file and return the decoded data structure.
#
# Parameters:
#   $file - path of the JSON file to read.
#
# Returns whatever from_json() produces for the file content.
# Dies if the file cannot be opened; the message names the file and the
# operating-system error. Decoding errors are raised by from_json().
sub load_conf {
    my($file) = @_;

    # Three-argument open with a lexical handle: the path can never be
    # mistaken for an open mode or a pipe, and no global handle is touched.
    open(my $fh, '<', $file)
        || die "Failed to load configuration file $file: $!";

    # Slurp the whole file at once.
    my $json = do { local $/; <$fh> };
    close($fh);
    $json = '' unless defined $json;

    return from_json($json);
}

# Open a MySQL connection through DBI.
#
# Parameters:
#   $sql - hash reference with the keys base (database name), host, user
#          and pass.
#
# Returns the connected database handle.
# Dies with the DBI error text if the connection cannot be established.
sub sqlconnect {
    my($sql) = @_;

    my $dsn = "DBI:mysql:database=$sql->{base};host=$sql->{host}";
    my $dbh = DBI->connect($dsn, $sql->{user}, $sql->{pass})
        || die "Failed to connect to database: "
             . ( $DBI::errstr || 'unknown error' );

    return $dbh;
}

# Prepare and execute a SQL statement with bind values.
#
# Parameters:
#   $dbh   - database handle providing prepare().
#   $query - SQL text, with "?" placeholders.
#   @args  - values bound to the placeholders, in order.
#
# Returns the executed statement handle so callers can fetch rows.
# Dies if either step fails; the message says which step failed and carries
# the handle's errstr.
sub sqlquery {
    my $dbh = shift;
    my $query = shift;
    my @args = @_;

    #print STDERR "$query\n";

    my $sth = $dbh->prepare($query)
        || die "Failed to prepare SQL query: "
             . ( $dbh->errstr || 'unknown error' );
    $sth->execute(@args)
        || die "Failed to execute SQL query: "
             . ( $sth->errstr || 'unknown error' );
    return $sth;
}

+ 53
- 0
bin/dev_cleanup_db.pl Прегледај датотеку

@@ -0,0 +1,53 @@
#!/usr/bin/perl

use strict;
use JSON;
use DBI;
use warnings;

# Connect to the database described in the shared configuration file
# (the path is relative to the current working directory).
my $conf = load_conf("../etc/autodoc.json");
my $dbh = sqlconnect($conf->{sql});

# Tables emptied by this development helper, in this order.
my @del = qw(pages documents tags words);

for my $table ( @del ) {
    my $statement = "DELETE FROM $table";
    print "$statement\n";
    sqlquery($dbh, $statement);
}

# Read a JSON configuration file and return the decoded data structure.
#
# Parameters:
#   $file - path of the JSON file to read.
#
# Returns whatever from_json() produces for the file content.
# Dies if the file cannot be opened; the message names the file and the
# operating-system error. This script defines no API error helper, so a
# plain die is the failure path.
sub load_conf {
    my($file) = @_;

    # Three-argument open with a lexical handle: the path can never be
    # mistaken for an open mode or a pipe, and no global handle is touched.
    open(my $fh, '<', $file)
        || die "Failed to load configuration file $file: $!";

    # Slurp the whole file at once.
    my $json = do { local $/; <$fh> };
    close($fh);
    $json = '' unless defined $json;

    return from_json($json);
}

# Open a MySQL connection through DBI.
#
# Parameters:
#   $sql - hash reference with the keys base (database name), host, user
#          and pass.
#
# Returns the connected database handle.
# Dies with the DBI error text if the connection cannot be established, so
# callers never receive a value they cannot call prepare() on.
sub sqlconnect {
    my($sql) = @_;

    my $dsn = "DBI:mysql:database=$sql->{base};host=$sql->{host}";
    my $dbh = DBI->connect($dsn, $sql->{user}, $sql->{pass})
        || die "Failed to connect to database: "
             . ( $DBI::errstr || 'unknown error' ) . "\n";

    return $dbh;
}

# Prepare and execute a SQL statement with bind values.
#
# Parameters:
#   $dbh   - database handle providing prepare().
#   $query - SQL text, with "?" placeholders.
#   @args  - values bound to the placeholders, in order.
#
# Returns the executed statement handle.
# Dies if either step fails, reporting the handle's errstr (DBI handles
# expose their last error through errstr; they have no error() method).
sub sqlquery {
    my $dbh = shift;
    my $query = shift;
    my @args = @_;

    my $sth = $dbh->prepare($query)
        || die "Failed to prepare SQL query: "
             . ( $dbh->errstr || 'unknown error' ) . "\n";
    $sth->execute(@args)
        || die "Failed to execute SQL query: "
             . ( $sth->errstr || 'unknown error' ) . "\n";
    return $sth;
}

+ 91
- 0
bin/load_dicts.pl Прегледај датотеку

@@ -0,0 +1,91 @@
#!/usr/bin/perl

use strict;
use JSON;
use DBI;
use utf8;
use GD::Simple;
use Data::Dumper;
use Data::UUID;
use warnings;

# Unbuffered STDOUT so the "\r"-terminated progress line is shown as it is
# printed.
$|=1;

$Data::Dumper::Sortkeys = 1;

# Load the JSON configuration (the path is relative to the current working
# directory) and connect to the database described by its "sql" section.
my $conf = load_conf("../etc/autodoc.json");
my $dbh = sqlconnect($conf->{sql});

# Language code => word list file names under /usr/share/dict.
my %lang = (
    'fr' => [ 'french' ],
    'de' => [ 'swiss', 'ngerman' ],
    'en' => [ 'british-english-large', 'american-english-large' ],
    'it' => [ 'italian' ],
);

foreach my $lang ( sort keys %lang ) {
    print "Loading language $lang ...\n";
    foreach my $dict ( @{$lang{$lang}} ) {
        my $file = '/usr/share/dict/'.$dict;

        # Word lists that cannot be read are skipped, not fatal.
        my $dict_fh;
        if ( !open($dict_fh, '<', $file) ) {
            warn "Skipping $file: $!\n";
            next;
        }

        # First pass: count the lines so progress and ETA can be reported.
        my $len = 0;
        while ( defined( my $line = <$dict_fh> ) ) {
            $len++;
        }
        seek($dict_fh, 0, 0) || die "Failed to rewind $file: $!";

        my $cnt = 0;
        my $start = time();

        # Second pass: one INSERT IGNORE per word, tagged with the language.
        while ( defined( my $word = <$dict_fh> ) ) {
            chomp $word;
            sqlquery($dbh, "INSERT IGNORE INTO dict SET word = ?, lang = ?",
                $word, $lang);
            $cnt++;

            # Refresh the progress line every 777 words and on the last word.
            if ( $cnt % 777 == 0 || $cnt == $len ) {
                my $elapsed = time() - $start;
                printf("\t%s %s/%s (%i%%) ETA: %ss \r",
                    $dict, $cnt, $len, int($cnt/$len*100),
                    # No estimate until at least one second has elapsed.
                    $elapsed == 0 ? '-' : int( ( $elapsed/$cnt*$len ) - $elapsed )
                );
            }
        }
        close($dict_fh);
        printf("\n");
    }
}

# Read a JSON configuration file and return the decoded data structure.
#
# Parameters:
#   $file - path of the JSON file to read.
#
# Returns whatever from_json() produces for the file content.
# Dies if the file cannot be opened; the message names the file and the
# operating-system error. Decoding errors are raised by from_json().
sub load_conf {
    my($file) = @_;

    # Three-argument open with a lexical handle: the path can never be
    # mistaken for an open mode or a pipe, and no global handle is touched.
    open(my $fh, '<', $file)
        || die "Failed to load configuration file $file: $!";

    # Slurp the whole file at once.
    my $json = do { local $/; <$fh> };
    close($fh);
    $json = '' unless defined $json;

    return from_json($json);
}

# Open a MySQL connection through DBI with the mysql_enable_utf8 attribute
# switched on.
#
# Parameters:
#   $sql - hash reference with the keys base (database name), host, user
#          and pass.
#
# Returns the connected database handle; dies if the connection fails.
sub sqlconnect {
    my $sql = shift;

    my %attr = ( mysql_enable_utf8 => 1 );
    my $dsn = "DBI:mysql:database=$sql->{base};host=$sql->{host}";

    my $dbh = DBI->connect($dsn, $sql->{user}, $sql->{pass}, \%attr);
    die "Failed to connect to database" unless $dbh;

    return $dbh;
}

# Prepare and execute a SQL statement with bind values.
#
# Parameters:
#   $dbh   - database handle providing prepare().
#   $query - SQL text, with "?" placeholders.
#   @args  - values bound to the placeholders, in order.
#
# Returns the executed statement handle.
# Dies if either step fails; the message says which step failed and carries
# the handle's errstr.
sub sqlquery {
    my $dbh = shift;
    my $query = shift;
    my @args = @_;

    #print STDERR "$query\n";

    my $sth = $dbh->prepare($query)
        || die "Failed to prepare SQL query: "
             . ( $dbh->errstr || 'unknown error' );
    $sth->execute(@args)
        || die "Failed to execute SQL query: "
             . ( $sth->errstr || 'unknown error' );
    return $sth;
}

Loading…
Откажи
Сачувај