extract_pdftext.pl 1.16 KB
#!/perl/bin/perl -w

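# Original note: first install CAM::PDF, for example: cpan install CAM::PDF
# (NOTE(review): nothing visible in this script uses CAM::PDF -- confirm it is still needed.)
# http://www.cyberciti.biz/faq/converter-pdf-files-to-text-format-command/
# Required: install poppler-utils, which provides the pdftotext command this script runs.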

use strict;                     # Good practice
use warnings;                   # Good practice
use Data::Dumper qw(Dumper);


# (1) Validate the command line: exactly one argument is expected, the path
# to the PDF file.  Both failure paths report on STDERR and exit with a
# non-zero status so that a calling shell or script can detect the error
# (a bare `exit` would report success).
my $num_args = @ARGV;           # array in scalar context == element count
if ($num_args != 1) {
    print STDERR "\nUsage: extract_pdftext.pl path/to/filename.pdf\n";
    exit 1;
}

# The single argument is the path of the PDF to scan.
my $filename = $ARGV[0];
unless (-e $filename) {
    print STDERR "\nFile: $filename doesn't exist!\n";
    exit 1;
}


# Run pdftotext on the first page only ("-l 1") and read its text from the
# child's STDOUT ("-" as the output file).
#
# The command is given as a LIST so that no shell is involved: $filename is
# passed to pdftotext as one literal argument, which keeps paths containing
# spaces or shell metacharacters working and prevents command injection.
# (List-form pipe open on Windows needs Perl 5.22 or newer.)
#
# The process is started exactly once; a failure to start it is fatal.
open(my $fh, '-|', 'pdftotext', '-l', '1', $filename, '-')
    or die "error extracting $filename: $!";

# Scan the extracted text line by line and stop at the first line that
# contains something shaped like a DOI ("10.<4+ digits>[.<digits>...]/<suffix>").
# @DOIS ends up holding every match found on that line, or stays empty when
# no line matched.
my @DOIS;
while (my $row = <$fh>) {
    chomp $row;
    @DOIS = ($row =~ m/\b(10[.][0-9]{4,}(?:[.][0-9]+)*\/(?:(?!["&\'<>])\S)+)\b/g);
    last if @DOIS;              # non-empty array is true: we have a hit
}

# Release the pipe and reap the child.  The status is deliberately not
# checked: leaving the loop early can make pdftotext exit on a closed pipe,
# which is not an error for our purposes.
close $fh;

# TODO: only the first DOI that was found is reported; any further matches
# are ignored.
#
# The human-readable note goes to STDERR, while the bare DOI (without a
# trailing newline) is printed to the default output handle.
if (@DOIS) {
    print {*STDERR} "$DOIS[0] - matches the DOI pattern\n";
    print "$DOIS[0]";
}