/usr/share/perl5/Gscan2pdf/Cuneiform.pm is in gscan2pdf 1.2.3-1.
This file is owned by root:root, with mode 0o644.
The actual contents of the file can be viewed below.
package Gscan2pdf::Cuneiform;
use 5.008005;
use strict;
use warnings;
use Carp;
use File::Temp; # To create temporary files
use Gscan2pdf::Document; # for slurp
use version;
our $VERSION = '1.2.3';
my ( %languages, $version, $setup, $logger );
# Detect the installed cuneiform and remember its version.
#
# Called as a class method with a logger object, which is stored in the
# file-level $logger on every call. Once detection has completed, later
# calls simply return the cached version string.
#
# Returns nothing if `which cuneiform` produces no output; otherwise the
# version parsed from the start of cuneiform's own output (undefined if the
# expected banner is not there).
sub setup {
    my $class = shift;
    $logger = shift;

    # Detection already done: reuse the cached result.
    if ($setup) {
        return $version;
    }

    # Is the binary on the PATH at all?
    my ( $path, $which_err ) =
      Gscan2pdf::Document::open_three('which cuneiform');
    unless ( defined $path and $path ne '' ) {
        return;
    }

    # Running cuneiform without arguments prints a banner holding the version.
    my ( $banner, $banner_err ) =
      Gscan2pdf::Document::open_three("cuneiform");
    if ( $banner =~ /^Cuneiform\ for\ Linux\ ([\d\.]+)/x ) {
        $version = $1;
    }

    $setup = 1;
    return $version;
}
# Return a reference to the file-level %languages hash, which maps each
# language code reported by `cuneiform -l` to a display name.
#
# The hash is filled on the first call that finds it empty. Codes missing
# from the built-in table are mapped to themselves. If cuneiform's output
# does not contain the expected "Supported languages: ..." text, that is
# logged and the hash is left as it was.
sub languages {

    # Already populated by an earlier call.
    return \%languages if %languages;

    # Display names for the cuneiform language codes
    my %name_for = (
        eng    => 'English',
        ger    => 'German',
        fra    => 'French',
        rus    => 'Russian',
        swe    => 'Swedish',
        spa    => 'Spanish',
        ita    => 'Italian',
        ruseng => 'Russian+English',
        ukr    => 'Ukrainian',
        srp    => 'Serbian',
        hrv    => 'Croatian',
        pol    => 'Polish',
        dan    => 'Danish',
        por    => 'Portuguese',
        dut    => 'Dutch',
        cze    => 'Czech',
        rum    => 'Romanian',
        hun    => 'Hungarian',
        bul    => 'Bulgarian',
        slo    => 'Slovak',
        slv    => 'Slovenian',
        lav    => 'Latvian',
        lit    => 'Lithuanian',
        est    => 'Estonian',
        tur    => 'Turkish',
    );

    # Ask cuneiform which languages it supports
    my $cmd = "cuneiform -l";
    $logger->info($cmd);
    my ($output) = Gscan2pdf::Document::open_three($cmd);

    # In list context the match yields the captured code list, or nothing.
    my ($langs) = $output =~ /Supported\ languages:\ (.*)\./x;
    if ( defined $langs ) {
        for my $code ( split " ", $langs ) {
            $languages{$code} =
              defined $name_for{$code} ? $name_for{$code} : $code;
        }
    }
    else {
        $logger->info("Unrecognised output from cuneiform: $output");
    }

    return \%languages;
}
# Run cuneiform on an image and return its hOCR output.
#
# Arguments (called as a class method):
#   $file     - path of the image to recognise
#   $language - cuneiform language code, passed to -l
#   $loggr    - logger object; handed to setup() if setup has not run yet and,
#               when defined, used for logging the command
#   $pidfile  - optional; when defined, this process's ID ($$) is written to
#               that file before cuneiform is started
#
# Returns whatever Gscan2pdf::Document::slurp() yields for the temporary
# output file. The exit status of cuneiform is not examined.
sub hocr {
    my ( $class, $file, $language, $loggr, $pidfile ) = @_;
    my ($bmp);
    Gscan2pdf::Cuneiform->setup($loggr) unless $setup;

    # Prefer the logger given to this call; fall back to the one kept by setup().
    my $log = defined $loggr ? $loggr : $logger;

    # Temporary filename for output
    my $txt = File::Temp->new( SUFFIX => '.txt' );

    # NOTE(review): setup() leaves $version unset when it cannot find cuneiform
    # or cannot read its banner - confirm callers only get here after a
    # successful setup().
    if ( version->parse("v$version") < version->parse('v1.1.0')
        and $file !~ /\.bmp$/x )
    {

        # Versions before 1.1.0 are fed a BMP copy of any non-.bmp input.
        # Temporary filename for new file
        $bmp = File::Temp->new( SUFFIX => '.bmp' );
        my $image = Image::Magick->new;
        $image->Read($file);

# Force TrueColor, as this produces DirectClass, which is what cuneiform expects.
# Without this, PseudoClass is often produced, for which cuneiform gives
# "PUMA_XFinalrecognition failed" warnings
        $image->Write( filename => $bmp, type => 'TrueColor' );
    }
    else {
        $bmp = $file;
    }

    # Build an argument list instead of a single shell string, so that file
    # names containing spaces or shell metacharacters reach cuneiform intact.
    # File::Temp objects are stringified to their file names.
    my @cmd =
      ( 'cuneiform', '-l', $language, '-f', 'hocr', '-o', "$txt", "$bmp" );
    $log->info( join q{ }, @cmd );

    # Record our PID without going through the shell. As before, a failure
    # to write the PID file does not stop the OCR run.
    if ( defined $pidfile ) {
        if ( open my $fh, '>', $pidfile ) {
            print {$fh} $$, "\n";
            close $fh or carp "Error closing $pidfile: $!";
        }
        else {
            carp "Cannot write $pidfile: $!";
        }
    }

    # List form of system() runs the program directly, with no shell.
    system(@cmd);

    return Gscan2pdf::Document::slurp($txt);
}
1;
__END__
|