Commit 9d00822b4913b00b1293665e28b10ff04d141d0d

Authored by Paulo Graça
0 parents
Exists in master

Initial commit

ingester/Readme.md
ingester/examples/example.pdf
No preview for this file type
ingester/examples/example1.pdf
No preview for this file type
ingester/ingest.sh
@@ -0,0 +1,21 @@ @@ -0,0 +1,21 @@
  1 +#!/bin/bash
  2 +
  3 +# find all PDF files
  4 +# find . -type f -name '*.pdf' -exec ./ingest.sh {} \;
  5 +
  6 +while [[ $# > 1 ]]
  7 +do
  8 + echo $1
  9 + shift # past argument or value
  10 +done
  11 +
  12 +mylogger() {
  13 + echo "$(date +%F\ %H:%M:%S:%N): $@"
  14 +}
  15 +
  16 +
  17 +mylogger "start ingesting DOI from PDF: $file"
  18 +
  19 +# Ingest
  20 +# based on DSPACE doc - https://wiki.duraspace.org/display/DSDOC18/Importing+and+Exporting+Items+via+Simple+Archive+Format
  21 +/dspace/bin/dspace import --add --eperson=sama-saw@asa.fccn.pt --collection=10400.25/300 --source=/tmp/test --mapfile=mapfile1
ingester/prepare.sh
@@ -0,0 +1,78 @@ @@ -0,0 +1,78 @@
  1 +#!/bin/bash
  2 +
  3 +# find all PDF files
  4 +# find . -type f -name '*.pdf' -exec ./ingest.sh {} \;
  5 +
  6 +while [[ $# > 1 ]]
  7 +do
  8 + echo $1
  9 + shift # past argument or value
  10 +done
  11 +
  12 +mylogger() {
  13 + echo "$(date +%F\ %H:%M:%S:%N): $@"
  14 +}
  15 +
  16 +file=$1
  17 +if [ ! -f $file ]; then
  18 + mylogger "[ERROR] couldn't find file - $file"
  19 + exit
  20 +fi
  21 +
  22 +filename=$(basename $file)
  23 +
  24 +# Current script dir
  25 +pushd `dirname $0` > /dev/null
  26 +SCRIPTPATH=`pwd`
  27 +popd > /dev/null
  28 +
  29 +mylogger "start extracting DOI from PDF - $file"
  30 +
  31 +DOI=$(perl $SCRIPTPATH/scripts/extract_pdftext.pl $file)
  32 +
  33 +if [ -z "$DOI" ]; then
  34 + mylogger "[ERROR] occurred when extracting the DOI, or it's missing"
  35 + exit
  36 +fi
  37 +
  38 +DOI_prefix=$(echo $DOI|cut -d'/' -f1)
  39 +DOI_suffix=$(echo $DOI|cut -d'/' -f2)
  40 +
  41 +DOI_FILE=$DOI_prefix.$DOI_suffix
  42 +
  43 +
  44 +# download json file
  45 +#http://api.crossref.org/works/10.4025/reveducfis.v22i3.9976
  46 +mylogger "downloading $DOI from crossref"
  47 +
  48 +ERROR_CODE=`echo $(curl --request GET -s -L -D /dev/stdout -o /tmp/$DOI_FILE.json http://api.crossref.org/works/$DOI)| head -n 1 | cut -d$' ' -f2`
  49 +
  50 +
  51 +if [ "$ERROR_CODE" != "200" ]; then
  52 + mylogger "[ERROR] crossref didn't found DOI - $DOI"
  53 + exit
  54 +fi
  55 +
  56 +
  57 +mylogger "extracting metadata from crossref file"
  58 +# extract metadata from file
  59 +perl $SCRIPTPATH/scripts/extract_metadata.pl /tmp/$DOI_FILE.json 1>/tmp/$DOI_FILE.xml
  60 +
  61 +# Prepare
  62 +mylogger "creating folder ./item_$DOI_FILE"
  63 +mkdir -p item_$DOI_FILE
  64 +
  65 +mylogger "mapping data"
  66 +perl $SCRIPTPATH/scripts/map_metadata.pl /tmp/$DOI_FILE.xml dc 1> item_$DOI_FILE/dublin_core.xml
  67 +perl $SCRIPTPATH/scripts/map_metadata.pl /tmp/$DOI_FILE.xml degois 1> item_$DOI_FILE/metadata_degois.xml
  68 +
  69 +mylogger "copying file"
  70 +echo "$filename" > item_$DOI_FILE/contents
  71 +yes | cp -rf $file item_$DOI_FILE/$filename
  72 +
  73 +
  74 +mylogger "cleaning tempfiles"
  75 +rm -rf /tmp/$DOI_FILE.json
  76 +rm -rf /tmp/$DOI_FILE.xml
  77 +
  78 +mylogger "ended"
ingester/scripts/extract_metadata.pl
@@ -0,0 +1,147 @@ @@ -0,0 +1,147 @@
  1 +#!/usr/bin/perl -w
  2 +
  3 +# First install JSON::Parse XML::LibXML XML::LibXSLT
  4 +#requires packages libxml-libxslt-perl, libxml-xpath-perl, libxml-perl
  5 +
  6 +use Data::Dumper; # Perl core module
  7 +use strict; # Good practice
  8 +use warnings; # Good practice
  9 +use JSON::Parse 'json_file_to_perl';
  10 +use XML::LibXML;
  11 +
  12 +# (1) quit unless we have the correct number of command-line args
  13 +my $num_args = $#ARGV + 1;
  14 +if ($num_args != 1) {
  15 + print "\nUsage: extract_metadata.pl path/to/filename.json\n";
  16 + exit;
  17 +}
  18 +
  19 +
  20 +my $filename=$ARGV[0];
  21 +unless (-e $filename) {
  22 + print "\nFile: $filename doesn't exist!\n";
  23 + exit;
  24 +}
  25 +
  26 +my $json_fromfile = json_file_to_perl ($filename);
  27 +my $json_base = $json_fromfile->{message};
  28 +#print Dumper $json_base;
  29 +
  30 +# Create XML document
  31 +my $document = XML::LibXML->createDocument( "1.0", "UTF-8" );
  32 +my $root = $document->createElement( 'crossref' );
  33 +$document->setDocumentElement( $root );
  34 +
  35 +my $subtitles = $document->createElement( 'subtitles' );
  36 +foreach my $item (@{$json_base->{subtitle}}) {
  37 + my $subtitle = $document->createElement( 'subtitle' );
  38 + $subtitle->addChild($document->createTextNode($item));
  39 + $subtitles->addChild($subtitle);
  40 +}
  41 +$root->addChild($subtitles);
  42 +
  43 +my $titles = $document->createElement( 'titles' );
  44 +foreach my $item (@{$json_base->{title}}) {
  45 + my $title = $document->createElement( 'title' );
  46 + $title->addChild($document->createTextNode($item));
  47 + $titles->addChild($title);
  48 +}
  49 +$root->addChild($titles);
  50 +
  51 +my $issue = $document->createElement( 'issue' );
  52 +$issue->addChild($document->createTextNode($json_base->{issue}));
  53 +$root->addChild($issue);
  54 +
  55 +my $type = $document->createElement( 'type' );
  56 +$type->addChild($document->createTextNode($json_base->{type}));
  57 +$root->addChild($type);
  58 +
  59 +my $volume = $document->createElement( 'volume' );
  60 +$volume->addChild($document->createTextNode($json_base->{volume}));
  61 +$root->addChild($volume);
  62 +
  63 +my $container_titles = $document->createElement( 'container-titles' );
  64 +foreach my $item (@{$json_base->{'container-title'}}) {
  65 + my $container_title = $document->createElement( 'container-title' );
  66 + $container_title->addChild($document->createTextNode($item));
  67 + $container_titles->addChild($container_title);
  68 +}
  69 +$root->addChild($container_titles);
  70 +
  71 +my $URL = $document->createElement( 'URL' );
  72 +$URL->addChild($document->createTextNode($json_base->{URL}));
  73 +$root->addChild($URL);
  74 +
  75 +my $DOI = $document->createElement( 'DOI' );
  76 +$DOI->addChild($document->createTextNode($json_base->{DOI}));
  77 +$root->addChild($DOI);
  78 +
  79 +
  80 +my $subjects = $document->createElement( 'subjects' );
  81 +foreach my $item (@{$json_base->{subject}}) {
  82 + my $subject = $document->createElement( 'subject' );
  83 + $subject->addChild($document->createTextNode($item));
  84 + $subjects->addChild($subject);
  85 +}
  86 +$root->addChild($subjects);
  87 +
  88 +my $ISSNs = $document->createElement( 'ISSNs' );
  89 +foreach my $item (@{$json_base->{ISSN}}) {
  90 + my $ISSN = $document->createElement( 'ISSN' );
  91 + $ISSN->addChild($document->createTextNode($item));
  92 + $ISSNs->addChild($ISSN);
  93 +}
  94 +$root->addChild($ISSNs);
  95 +
  96 +my $publisher = $document->createElement( 'publisher' );
  97 +$publisher->addChild($document->createTextNode($json_base->{publisher}));
  98 +$root->addChild($publisher);
  99 +
  100 +my $authors = $document->createElement( 'authors' );
  101 +foreach my $item (@{$json_base->{author}}) {
  102 + my $author = $document->createElement( 'author' );
  103 +
  104 + my $family = $document->createElement( 'family' );
  105 + $family->addChild($document->createTextNode($item->{'family'}));
  106 +
  107 + my $given = $document->createElement( 'given' );
  108 + $given->addChild($document->createTextNode($item->{'given'}));
  109 +
  110 + #print Dumper "<affiliation>".$item->{affiliation}."</affiliation>";
  111 + $author->addChild($family);
  112 + $author->addChild($given);
  113 + $authors->addChild($author);
  114 +}
  115 +$root->addChild($authors);
  116 +
  117 +
  118 +my $issued = $document->createElement( 'issued' );
  119 +foreach my $item (@{$json_base->{issued}->{'date-parts'}}) {
  120 + my $date = $document->createElement( 'date' );
  121 +
  122 + my $year = $document->createElement( 'year' );
  123 + my $month = $document->createElement( 'month' );
  124 + my $day = $document->createElement( 'day' );
  125 +
  126 + if (defined (@{$item}[0])) {
  127 + $year->addChild($document->createTextNode(@{$item}[0]));
  128 + }
  129 + if (defined (@{$item}[1])) {
  130 + $month->addChild($document->createTextNode(@{$item}[1]));
  131 + }
  132 + if (defined (@{$item}[2])) {
  133 + $day->addChild($document->createTextNode(@{$item}[2]));
  134 + }
  135 +
  136 + $date->addChild($year);
  137 + $date->addChild($month);
  138 + $date->addChild($day);
  139 +
  140 + $issued->addChild($date);
  141 +}
  142 +$root->addChild($issued);
  143 +
  144 +
  145 +print $document->toString(2);
  146 +#print $document->toString(0);
  147 +
ingester/scripts/extract_pdftext.pl
@@ -0,0 +1,47 @@ @@ -0,0 +1,47 @@
  1 +#!/perl/bin/perl -w
  2 +
  3 +# First install CAM::PDF, example: cpan install CAM::PDF
  4 +# http://www.cyberciti.biz/faq/converter-pdf-files-to-text-format-command/
  5 +# First install poppler-utils
  6 +
  7 +use strict; # Good practice
  8 +use warnings; # Good practice
  9 +use Data::Dumper qw(Dumper);
  10 +
  11 +
  12 +# (1) quit unless we have the correct number of command-line args
  13 +my $num_args = $#ARGV + 1;
  14 +if ($num_args != 1) {
  15 + print STDERR "\nUsage: extract_pdftext.pl path/to/filename.pdf\n";
  16 + exit;
  17 +}
  18 +
  19 +my $filename=$ARGV[0];
  20 +unless (-e $filename) {
  21 + print STDERR "\nFile: $filename doesn't exist!\n";
  22 + exit;
  23 +}
  24 +
  25 +
  26 +# read only the first page to STDOUT
  27 +open ( my $fh, "-|","pdftotext -l 1 $filename -") or
  28 + die "error extracting $filename";
  29 +
  30 +
  31 +my @DOIS;
  32 +if (open(my $fh, "-|","pdftotext -l 1 $filename -")) {
  33 + while (my $row = <$fh>) {
  34 + chomp $row;
  35 + #print "$row\n";
  36 + @DOIS = ($row =~ m/\b(10[.][0-9]{4,}(?:[.][0-9]+)*\/(?:(?!["&\'<>])\S)+)\b/g);
  37 + if (scalar @DOIS ne 0) {
  38 + last;
  39 + }
  40 + }
  41 +}
  42 +
  43 +#TODO: this is reading only the first element found
  44 +if (scalar @DOIS ne 0) {
  45 + print STDERR "$DOIS[0] - matches the DOI pattern\n";
  46 + print "$DOIS[0]";
  47 +}
ingester/scripts/map_metadata.pl
@@ -0,0 +1,56 @@ @@ -0,0 +1,56 @@
  1 +#!/usr/bin/perl -w
  2 +
  3 +use strict; # Good practice
  4 +use warnings; # Good practice
  5 +use Data::Dumper; # Perl core module
  6 +use XML::LibXML;
  7 +use XML::LibXSLT;
  8 +use File::Spec;
  9 +
  10 +#requires packages libxml-libxslt-perl, libxml-xpath-perl, libxml-perl
  11 +
  12 +# (1) quit unless we have the correct number of command-line args
  13 +my $num_args = $#ARGV + 1;
  14 +if ($num_args < 1 and $num_args > 2) {
  15 + print STDERR "\nUsage: map_metadata.pl path/to/filename.xml [dc|degois]\n";
  16 + exit;
  17 +}
  18 +
  19 +
  20 +my $filename=$ARGV[0];
  21 +unless (-e $filename) {
  22 + print STDERR "\nFile: $filename doesn't exist!\n";
  23 + exit;
  24 +}
  25 +
  26 +my $schema = 'dc'; # default schema
  27 +if (defined($ARGV[1])) {
  28 + $schema=$ARGV[1];
  29 +}
  30 +
  31 +my @available_schemas = qw(dc degois);
  32 +
  33 +# if schema is in available schemas array
  34 +if (! grep { $_ eq $schema} @available_schemas ) {
  35 + die "$schema isn't a valid schema";
  36 +}
  37 +
  38 +
  39 +my ($volume, $directory, $file) = File::Spec->splitpath(__FILE__);
  40 +
  41 +unless (-e $directory."../xslt/$schema-map.xslt") {
  42 + print STDERR "\nFile: $schema-map.xslt doesn't exist!\n";
  43 + exit;
  44 +}
  45 +
  46 +XML::LibXSLT->max_depth(1000);
  47 +
  48 +my $xslt = XML::LibXSLT->new();
  49 +
  50 +
  51 +my $source = XML::LibXML->load_xml(location => $filename);
  52 +my $style_doc = XML::LibXML->load_xml(location=>$directory."../xslt/$schema-map.xslt", no_cdata=>1);
  53 +my $stylesheet = $xslt->parse_stylesheet($style_doc);
  54 +my $results = $stylesheet->transform($source);
  55 +
  56 +print $stylesheet->output_as_bytes($results);
ingester/setup.sh
@@ -0,0 +1,41 @@ @@ -0,0 +1,41 @@
  1 +#!/bin/bash
  2 +
  3 +ARCH=$(uname -m | sed 's/x86_//;s/i[3-6]86/32/')
  4 +
  5 +if [ -f /etc/lsb-release ]; then
  6 + . /etc/lsb-release
  7 + OS=$DISTRIB_ID
  8 + VER=$DISTRIB_RELEASE
  9 +elif [ -f /etc/debian_version ]; then
  10 + OS=Debian # XXX or Ubuntu??
  11 + VER=$(cat /etc/debian_version)
  12 +elif [ -f /etc/redhat-release ]; then
  13 + OS=Redhat # XXX or Centos??
  14 + VER=$(cat /etc/redhat-release)
  15 +else
  16 + OS=$(uname -s)
  17 + VER=$(uname -r)
  18 +fi
  19 +
  20 +echo "install poppler-utils (pdftotext)"
  21 +if ("$OS" == "Redhat"); then
  22 + yum install poppler-utils
  23 +elif ("$OS" == "Debian"); then
  24 + sudo apt-get install poppler-utils
  25 +else
  26 + echo "unsupported distro"
  27 + exit
  28 +fi
  29 +
  30 +echo "install cpanm"
  31 +wget -O - http://cpanmin.us | perl - --self-upgrade
  32 +
  33 +
  34 +echo "install required libs"
  35 +cpanm Data::Dumper
  36 +cpanm XML::LibXML
  37 +cpanm XML::LibXSLT
  38 +cpanm File::Spec
  39 +cpanm JSON::Parse
  40 +
  41 +echo "instalation complete"
ingester/xslt/dc-map.xslt
@@ -0,0 +1,80 @@ @@ -0,0 +1,80 @@
  1 +<?xml version="1.0" encoding="utf-8"?>
  2 +<!-- Maps the crossref XML written by extract_metadata.pl to a dublin_core document of dcvalue elements. -->
  3 +<xsl:stylesheet
  4 + xmlns:xml="http://www.w3.org/XML/1998/namespace"
  5 + xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
  6 + version="1.0">
  7 +<xsl:output method="xml" version="1.0" encoding="UTF-8" indent="yes"/>
  8 +<xsl:template match="/">
  9 +<dublin_core>
  10 + <xsl:for-each select="/crossref/titles/title">
  11 + <dcvalue element="title" qualifier="none"><xsl:value-of select="text()"/></dcvalue>
  12 + </xsl:for-each>
  13 + <!-- Issued date: the year, then a hyphen and the month when month is non-empty, then a hyphen and the day when day is also non-empty. -->
  14 + <xsl:for-each select="/crossref/issued/date">
  15 + <dcvalue element="date" qualifier="issued">
  16 + <xsl:value-of select="year"/>
  17 + <xsl:if test="month != ''">
  18 + <xsl:text>-</xsl:text>
  19 + <xsl:value-of select="month"/>
  20 + <xsl:if test="day != ''">
  21 + <xsl:text>-</xsl:text>
  22 + <xsl:value-of select="day"/>
  23 + </xsl:if>
  24 + </xsl:if>
  25 + </dcvalue>
  26 + </xsl:for-each>
  27 +
  28 + <xsl:for-each select="/crossref/container-titles/container-title">
  29 + <dcvalue element="source" qualifier="none"><xsl:value-of select="text()"/></dcvalue>
  30 + </xsl:for-each>
  31 + <xsl:for-each select="/crossref/publisher">
  32 + <dcvalue element="publisher" qualifier="none"><xsl:value-of select="text()"/></dcvalue>
  33 + </xsl:for-each>
  34 + <!-- Each author is written as "family, given". -->
  35 + <xsl:for-each select="/crossref/authors/author">
  36 + <dcvalue element="contributor" qualifier="author">
  37 + <xsl:value-of select="family"/>
  38 + <xsl:text>, </xsl:text>
  39 + <xsl:value-of select="given"/></dcvalue>
  40 + </xsl:for-each>
  41 +
  42 + <xsl:for-each select="/crossref/DOI">
  43 + <dcvalue element="identifier" qualifier="none"><xsl:value-of select="text()"/></dcvalue>
  44 + </xsl:for-each>
  45 +
  46 + <xsl:for-each select="/crossref/ISSNs/ISSN">
  47 + <dcvalue element="identifier" qualifier="issn"><xsl:value-of select="text()"/></dcvalue>
  48 + </xsl:for-each>
  49 + <!-- NOTE(review): extract_metadata.pl in this commit writes no ISBNs element, so this loop matches nothing yet. -->
  50 + <xsl:for-each select="/crossref/ISBNs/ISBN">
  51 + <dcvalue element="identifier" qualifier="isbn"><xsl:value-of select="text()"/></dcvalue>
  52 + </xsl:for-each>
  53 +
  54 + <xsl:for-each select="/crossref/type">
  55 + <dcvalue element="type" qualifier="none"><xsl:value-of select="text()"/></dcvalue>
  56 + </xsl:for-each>
  57 +
  58 + <xsl:for-each select="/crossref/subjects/subject">
  59 + <dcvalue element="subject" qualifier="none"><xsl:value-of select="text()"/></dcvalue>
  60 + </xsl:for-each>
  61 + <!-- NOTE(review): editors, translators and chairs are likewise not written by extract_metadata.pl in this commit; these three loops match nothing yet. -->
  62 + <xsl:for-each select="/crossref/editors/editor">
  63 + <dcvalue element="contributor" qualifier="editor"><xsl:value-of select="text()"/></dcvalue>
  64 + </xsl:for-each>
  65 + <xsl:for-each select="/crossref/translators/translator">
  66 + <dcvalue element="contributor" qualifier="other"><xsl:value-of select="text()"/></dcvalue>
  67 + </xsl:for-each>
  68 + <xsl:for-each select="/crossref/chairs/chair">
  69 + <dcvalue element="contributor" qualifier="other"><xsl:value-of select="text()"/></dcvalue>
  70 + </xsl:for-each>
  71 +
  72 +</dublin_core>
  73 +
  74 +
  75 + </xsl:template>
  76 +
  77 +</xsl:stylesheet>
  78 +
  79 +
  80 +
ingester/xslt/degois-map.xslt
@@ -0,0 +1,30 @@ @@ -0,0 +1,30 @@
  1 +<?xml version="1.0" encoding="utf-8"?>
  2 +<!-- Maps the crossref XML written by extract_metadata.pl to dcvalue elements under schema "degois". -->
  3 +<xsl:stylesheet
  4 + xmlns:xml="http://www.w3.org/XML/1998/namespace"
  5 + xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
  6 + version="1.0">
  7 +
  8 + <xsl:output method="xml" version="1.0" encoding="UTF-8" indent="yes"/>
  9 + <xsl:template match="/">
  10 + <!-- NOTE(review): extract_metadata.pl in this commit writes no firstPage or lastPage element, so the first two loops match nothing yet. -->
  11 + <dublin_core schema="degois">
  12 + <xsl:for-each select="/crossref/firstPage">
  13 + <dcvalue element="publication" qualifier="firstPage"><xsl:value-of select="text()"/></dcvalue>
  14 + </xsl:for-each>
  15 + <xsl:for-each select="/crossref/lastPage">
  16 + <dcvalue element="publication" qualifier="lastPage"><xsl:value-of select="text()"/></dcvalue>
  17 + </xsl:for-each>
  18 + <xsl:for-each select="/crossref/container-titles/container-title">
  19 + <dcvalue element="publication" qualifier="title"><xsl:value-of select="text()"/></dcvalue>
  20 + </xsl:for-each>
  21 + <xsl:for-each select="/crossref/volume">
  22 + <dcvalue element="publication" qualifier="volume"><xsl:value-of select="text()"/></dcvalue>
  23 + </xsl:for-each>
  24 + <xsl:for-each select="/crossref/issue">
  25 + <dcvalue element="publication" qualifier="issue"><xsl:value-of select="text()"/></dcvalue>
  26 + </xsl:for-each>
  27 + </dublin_core>
  28 +
  29 + </xsl:template>
  30 +</xsl:stylesheet>