prepare.sh 1.79 KB
#!/bin/bash

# find all PDF files
# find . -type f -name '*.pdf' -exec ./ingest.sh {} \;

# Echo and discard every positional parameter except the last one, so that
# afterwards $1 (if present) is the final command-line argument.
# (( )) compares numerically; inside [[ ]] the > operator compares strings.
while (( $# > 1 )); do
    # Quoted + printf: print the argument verbatim (no word splitting, no
    # glob expansion, no special treatment of values such as "-n").
    printf '%s\n' "$1"
    shift # past argument or value
done

# Write one timestamped log line to stdout.
# Arguments: the message words; they are joined with single spaces.
# Outputs:   "<timestamp>: <message>", timestamp formatted by
#            date +"%F %H:%M:%S:%N".
mylogger() {
  local IFS=' ' # "$*" joins on the first IFS character; pin it to a space
  local stamp
  stamp=$(date +%F\ %H:%M:%S:%N)
  printf '%s: %s\n' "$stamp" "$*"
}

# The PDF to ingest is whatever is left in $1 at this point.
file=$1
# The path must be quoted: an unquoted empty value turns the test into
# `[ ! -f ]`, which is false, so a missing argument would go unnoticed.
if [ ! -f "$file" ]; then
    mylogger "[ERROR] couldn't find file - $file"
    exit 1 # a bare `exit` would return mylogger's (successful) status
fi

# File name without its directory part; quoted so spaces survive.
filename=$(basename -- "$file")

# Current script dir: absolute path of the directory that contains this script.
# Resolved inside a command substitution (a subshell), so the working
# directory and directory stack of the script itself are left untouched.
# Quoting keeps paths with spaces intact; cd's stdout is discarded because it
# prints the target directory when CDPATH is in effect.
SCRIPTPATH=$(cd -- "$(dirname -- "$0")" > /dev/null && pwd)
if [ -z "$SCRIPTPATH" ]; then
    mylogger "[ERROR] couldn't determine the script directory"
    exit 1
fi

mylogger "start extracting DOI from PDF - $file"

# Whatever extract_pdftext.pl writes to stdout is taken as the DOI
# (presumably a single "<prefix>/<suffix>" token - confirm against the script).
DOI=$(perl "$SCRIPTPATH/scripts/extract_pdftext.pl" "$file")

if [ -z "$DOI" ]; then
    mylogger "[ERROR] occurred when extracting the DOI, or it's missing"
    exit 1 # report the failure; a bare `exit` would return status 0 here
fi

# Split the DOI on "/" with parameter expansion (no word splitting, no
# globbing, no extra processes):
#   DOI_prefix - everything before the first "/"
#   DOI_suffix - the segment between the first and second "/" only
# A DOI without any "/" yields the whole DOI for both parts.
# NOTE(review): DOIs whose suffix itself contains "/" share a DOI_FILE with
# every other DOI that has the same first suffix segment.
DOI_prefix=${DOI%%/*}
DOI_suffix=${DOI#*/}
DOI_suffix=${DOI_suffix%%/*}

# Base name used for the temporary files and the item folder.
DOI_FILE=$DOI_prefix.$DOI_suffix


# download json file
#http://api.crossref.org/works/10.4025/reveducfis.v22i3.9976
mylogger "downloading $DOI from crossref"

# The response body goes to /tmp/$DOI_FILE.json; stdout carries only the HTTP
# status written by -w. With -L, %{http_code} is the status of the final
# response, so a redirect in front of a 200 is not mistaken for a failure.
# curl reports 000 when no HTTP response was received at all.
ERROR_CODE=$(curl -s -L -o "/tmp/$DOI_FILE.json" -w '%{http_code}' \
    "http://api.crossref.org/works/$DOI")

if [ "$ERROR_CODE" != "200" ]; then
    mylogger "[ERROR] crossref didn't found DOI - $DOI"
    rm -f -- "/tmp/$DOI_FILE.json" # don't leave an error body behind in /tmp
    exit 1
fi


# Paths used by the remaining steps.
crossref_json="/tmp/$DOI_FILE.json" # Crossref response downloaded earlier
crossref_xml="/tmp/$DOI_FILE.xml"   # intermediate metadata file
item_dir="item_$DOI_FILE"           # output folder, relative to the cwd

# Log an error, remove the temporary files and stop with a failure status.
# Arguments: the error message.
abort_ingest() {
    mylogger "[ERROR] $*"
    rm -f -- "$crossref_json" "$crossref_xml"
    exit 1
}

# Every step below is checked so that a failure cannot leave a half-built
# item behind while the script still exits with status 0.
# NOTE(review): assumes the perl helpers exit non-zero on failure - confirm.

mylogger "extracting metadata from crossref file"
# extract metadata from file
perl "$SCRIPTPATH/scripts/extract_metadata.pl" "$crossref_json" 1> "$crossref_xml" \
    || abort_ingest "couldn't extract metadata from $crossref_json"

# Prepare
mylogger "creating folder ./$item_dir"
mkdir -p -- "$item_dir" || abort_ingest "couldn't create folder ./$item_dir"

mylogger "mapping data"
perl "$SCRIPTPATH/scripts/map_metadata.pl" "$crossref_xml" dc 1> "$item_dir/dublin_core.xml" \
    || abort_ingest "couldn't map metadata to dc"
perl "$SCRIPTPATH/scripts/map_metadata.pl" "$crossref_xml" degois 1> "$item_dir/metadata_degois.xml" \
    || abort_ingest "couldn't map metadata to degois"

mylogger "copying file"
# The contents file holds the name of the copied PDF.
printf '%s\n' "$filename" > "$item_dir/contents" \
    || abort_ingest "couldn't write $item_dir/contents"
# Plain `cp -f`: with -f cp never prompts, so piping `yes` into it is useless,
# and without -r a symlinked PDF is copied as a file rather than as a link.
cp -f -- "$file" "$item_dir/$filename" \
    || abort_ingest "couldn't copy $file to $item_dir"

mylogger "cleaning tempfiles"
rm -f -- "$crossref_json" "$crossref_xml"

mylogger "ended"