Browse Source

pdf txt extraction

master
root 5 years ago
parent
commit
10ff99f344
3 changed files with 23 additions and 24 deletions
  1. 20
    24
      cgi/autodoc.fcgi
  2. 1
    0
      etc/autodoc.json.default
  3. 2
    0
      src/commands.txt

+ 20
- 24
cgi/autodoc.fcgi View File

use DBI; use DBI;
use GD::Simple; use GD::Simple;
use Data::Dumper; use Data::Dumper;
use Data::UUID;
use warnings; use warnings;


$Data::Dumper::Sortkeys = 1; $Data::Dumper::Sortkeys = 1;


my($code, $hdr, $type, $data) = process_query($method, $path, $qs, $post, $user); my($code, $hdr, $type, $data) = process_query($method, $path, $qs, $post, $user);


print STDERR Dumper("code",$code,"hdr", $hdr, "type", $type, "data", $data);
#print STDERR Dumper("code",$code,"hdr", $hdr, "type", $type, "data", $data);


$hdr = [ ] if !defined $hdr; $hdr = [ ] if !defined $hdr;


send_response($code, $hdr, $type, $data); send_response($code, $hdr, $type, $data);
} }


# Build a fresh identifier string using Data::UUID.
# Takes no arguments; returns the string produced by create_str(),
# folded to lowercase with lc().
sub gen_uuid {
    return lc(Data::UUID->new->create_str());
}

sub send_response { sub send_response {
my ($code, $hdr, $type, $data) = @_; my ($code, $hdr, $type, $data) = @_;


$size, $size,
$dst $dst
); );
print STDERR "CMD=$cmd\n";
#print STDERR "CMD=$cmd\n";
system($cmd); system($cmd);
} }


sub api_v1_POST_documents { sub api_v1_POST_documents {
my($id, $qs, $post, $user) = @_; my($id, $qs, $post, $user) = @_;


$id = gen_uuid();

my $q = sqlquery($dbh, " my $q = sqlquery($dbh, "
INSERT INTO documents INSERT INTO documents
SET SET
id = ?,
owner = ?, owner = ?,
status = 'nodata' status = 'nodata'
", $user);
$q = sqlquery($dbh, "SELECT LAST_INSERT_ID()");
while(my($lastid) = $q->fetchrow_array()) {
$id = $lastid;
}
", $id, $user);


return db_get_document_object($id); return db_get_document_object($id);
} }
sub api_v1_POST_documents_id_data { sub api_v1_POST_documents_id_data {
my($id, $qs, $post, $user) = @_; my($id, $qs, $post, $user) = @_;


my $pageid;
my $uuid = gen_uuid();
my $ext = $post->{ctype};
$ext =~ s/.*\///;


sqlquery($dbh, "
INSERT INTO
pages
SET
owner = ?,
documentId = ?,
contenttype = ?,
created = NOW(),
status = 'inprogress'",
$user, $id, $post->{ctype});
my $q = sqlquery($dbh, "SELECT LAST_INSERT_ID()");
while(my($last) = $q->fetchrow_array()) {
$pageid = $last;
}
my $file = sprintf("%s/%s/%s_%s.%s",
$conf->{path}{global},
$conf->{path}{queue},
$id, $uuid, $ext);


my $file = $conf->{path}{global} . '/' . $conf->{path}{originals} . '/' . $pageid;
open(F,'>'.$file); open(F,'>'.$file);
print F $post->{data}; print F $post->{data};
close(F); close(F);
my $query = shift; my $query = shift;
my @args = @_; my @args = @_;


print STDERR "$query\n";
#print STDERR "$query\n";


my $sth = $dbh->prepare($query) || fatal_api_error(500,"Failed to execute SQL query"); my $sth = $dbh->prepare($query) || fatal_api_error(500,"Failed to execute SQL query");
$sth->execute(@args) || fatal_api_error(500,"Failed to execute SQL query"); $sth->execute(@args) || fatal_api_error(500,"Failed to execute SQL query");

+ 1
- 0
etc/autodoc.json.default View File

"path": { "path": {
"global": "/opt/autodoc", "global": "/opt/autodoc",
"original": "var/original", "original": "var/original",
"queue": "var/queue",
"images": "var/images", "images": "var/images",
"cache": "var/cache", "cache": "var/cache",
"error_img": "var/error_img.jpeg" "error_img": "var/error_img.jpeg"

+ 2
- 0
src/commands.txt View File

apt-get install aspell-fr aspell-it aspell-de aspell-en apt-get install aspell-fr aspell-it aspell-de aspell-en
apt-get install wfrench wbritish-large witalian wswiss wngerman wamerican-large
apt-get install tesseract-ocr-fra tesseract-ocr-deu tesseract-ocr-ita tesseract-ocr-eng apt-get install tesseract-ocr-fra tesseract-ocr-deu tesseract-ocr-ita tesseract-ocr-eng
apt-get install poppler-utils


cd /opt/autodoc/www/js/ cd /opt/autodoc/www/js/
npm install popper.js --save npm install popper.js --save

Loading…
Cancel
Save