Blame view

ingester/scripts/extract_pdftext.pl 1.16 KB
9d00822b4   Paulo Gra├ža   Initial commit
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
  #!/perl/bin/perl -w
  
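  # Prerequisites:
  #   - CAM::PDF, e.g.: cpan install CAM::PDF
  #     NOTE(review): CAM::PDF is not loaded anywhere in this script; confirm it is still needed.
  #   - poppler-utils, which provides the pdftotext command used below, see
  #     http://www.cyberciti.biz/faq/converter-pdf-files-to-text-format-command/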
  
  use strict;                     # Good practice
  use warnings;                   # Good practice
  use Data::Dumper qw(Dumper);
  
  
  # (1) Quit unless exactly one command-line argument (the PDF path) was given.
  # The usage text goes to STDERR so STDOUT carries nothing but the DOI, and
  # the exit status is non-zero so callers can tell a usage error from success.
  my $num_args = scalar @ARGV;
  if ($num_args != 1) {
      print STDERR "\nUsage: extract_pdftext.pl path/to/filename.pdf\n";
      exit 1;
  }
  
  # (2) The only argument is the path of the PDF to scan. Bail out early, with
  # a non-zero exit status, when nothing exists at that path.
  my $filename = $ARGV[0];
  unless (-e $filename) {
      print STDERR "\nFile: $filename doesn't exist!\n";
      exit 1;
  }
  
  
  # (3) Convert only the first page of the PDF to text ("-l 1") and have
  # pdftotext write it to its STDOUT (the trailing "-"), which is read here
  # through a pipe.
  #
  # pdftotext is started exactly once. The list form of open passes each
  # argument to the program as-is, without a shell, so file names containing
  # spaces or shell metacharacters are handled correctly.
  # NOTE(review): list-form pipe open needs Perl 5.8+ (5.22+ on Windows).
  my @DOIS;
  open(my $fh, "-|", "pdftotext", "-l", "1", $filename, "-") or
      die "error extracting $filename: $!";

  # Scan line by line and stop at the first line that contains at least one
  # DOI-looking token; @DOIS then holds every match found on that line.
  while (my $row = <$fh>) {
      chomp $row;
      #print "$row\n";
      @DOIS = ($row =~ m/\b(10[.][0-9]{4,}(?:[.][0-9]+)*\/(?:(?!["&\'<>])\S)+)\b/g);
      last if @DOIS;
  }

  # Close the pipe to reap the child process. The result is deliberately not
  # checked: when the loop stops early, pdftotext may be terminated while
  # still writing, which makes close() report a failure that is harmless here.
  close($fh);
  
  # (4) Report the result: a human-readable note on STDERR and the bare DOI
  # (no trailing newline) on STDOUT. Nothing is printed when no DOI was found.
  #TODO: this is reading only the first element found
  if (@DOIS) {
      print STDERR "$DOIS[0] - matches the DOI pattern\n";
      print "$DOIS[0]";
  }