Blame view

ingester/prepare.sh 2.04 KB
9d00822b4   Paulo Graca   Initial commit
1
2
3
  #!/bin/bash
  
  # find all PDF files
025937339   Paulo Graca   PG: add exec path
4
  # time find . -type f -name '*.pdf' -exec ./prepare.sh {} \;
9d00822b4   Paulo Graca   Initial commit
5
6
7
8
9
10
  
  # Print and discard every positional argument except the last one, so that
  # the code below finds the remaining argument in $1.
  # (( )) compares the argument count numerically; inside [[ ]] the '>'
  # operator is a string comparison.
  while (( $# > 1 )); do
      # printf with a quoted expansion prints the argument verbatim: no word
      # splitting, no glob expansion, and no special-casing of values like -n.
      printf '%s\n' "$1"
      shift # past argument or value
  done
5e6d00105   Paulo Graca   Fixing error mess...
11
12
13
14
  #######################################
  # Print one timestamped log line to stdout.
  # The timestamp is rendered in grey and every literal "[ERROR]" tag in the
  # message is prefixed with the bright-red colour escape.
  # Arguments: the message; multiple arguments are joined with spaces.
  # Outputs:   a single line on stdout containing ANSI colour escapes.
  #######################################
  function mylogger() {
      local error_color=$'\e[91m'
      local grey=$'\e[90m'
      local reset=$'\e[0m'
      local tagged="${error_color}[ERROR]"

      # "$*" keeps the message exactly as passed: no word splitting and no
      # glob expansion (a '*' in a file name must not expand to file names).
      local message="$*"
      message=${message//\[ERROR\]/$tagged}

      # The message is printed with %s so backslashes in it (e.g. in paths)
      # are shown literally instead of being interpreted as escapes.
      printf '%s%s:%s %s%s\n' \
          "$grey" "$(date +%F\ %H:%M:%S:%N)" "$reset" "$message" "$reset"
  }
  
  # The PDF to process is the first remaining positional argument.
  file=${1-}
  # The path is quoted so an empty or space-containing value is tested as a
  # single word; unquoted, a missing argument made this check pass silently.
  if [[ ! -f "$file" ]]; then
      mylogger "[ERROR] couldn't find file - $file"
      exit 1
  fi

  filename=$(basename -- "$file")

  # Current script dir, resolved in a subshell so the caller's working
  # directory is never changed. cd's own output is discarded because it
  # prints the target path when CDPATH is in effect.
  SCRIPTPATH=$(cd -- "$(dirname -- "$0")" > /dev/null && pwd) || {
      mylogger "[ERROR] couldn't resolve script directory - $0"
      exit 1
  }
  # pwd prints no trailing slash, so the separator must be added explicitly
  # when probing for an 'ingester' sub-directory next to the script.
  if [[ -d "$SCRIPTPATH/ingester" ]]; then
      SCRIPTPATH="$SCRIPTPATH/ingester"
  fi

  mylogger "start extracting DOI from PDF - $file"
9d00822b4   Paulo Graca   Initial commit
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
  # Private scratch directory for the intermediate crossref JSON / XML files.
  # The EXIT trap removes it on every exit path, including the error exits
  # below, so no temporary files are left behind.
  tmp_dir=$(mktemp -d) || {
      mylogger "[ERROR] couldn't create temporary directory"
      exit 1
  }
  cleanup_tempfiles() {
      rm -rf -- "${tmp_dir:?}"
  }
  trap cleanup_tempfiles EXIT

  # The helper script prints the DOI found in the PDF on stdout.
  DOI=$(perl "$SCRIPTPATH/scripts/extract_pdftext.pl" "$file")

  if [[ -z "$DOI" ]]; then
      mylogger "[ERROR] occurred when extracting the DOI, or it's missing"
      exit 1
  fi

  # Split the DOI on '/': the prefix is everything before the first '/',
  # the suffix is the segment between the first and second '/'
  # (the same fields that `cut -d/ -f1` and `cut -d/ -f2` select).
  DOI_prefix=${DOI%%/*}
  DOI_suffix=${DOI#*/}
  DOI_suffix=${DOI_suffix%%/*}

  DOI_FILE=$DOI_prefix.$DOI_suffix

  json_file="$tmp_dir/$DOI_FILE.json"
  xml_file="$tmp_dir/$DOI_FILE.xml"

  # download json file
  #http://api.crossref.org/works/10.4025/reveducfis.v22i3.9976
  mylogger "downloading $DOI from crossref"

  # --write-out '%{http_code}' reports the status of the final response, so
  # a redirect followed by -L is no longer mistaken for a failure.
  ERROR_CODE=$(curl -s -L -o "$json_file" -w '%{http_code}' \
      "http://api.crossref.org/works/$DOI")

  if [[ "$ERROR_CODE" != "200" ]]; then
      mylogger "[ERROR] crossref didn't found DOI - $DOI"
      exit 1
  fi

  mylogger "extracting metadata from crossref file"
  # extract metadata from file
  perl "$SCRIPTPATH/scripts/extract_metadata.pl" "$json_file" > "$xml_file"

  # Prepare the item folder in the current working directory
  mylogger "creating folder ./item_$DOI_FILE"
  mkdir -p -- "item_$DOI_FILE"

  mylogger "mapping data"
  perl "$SCRIPTPATH/scripts/map_metadata.pl" "$xml_file" dc \
      > "item_$DOI_FILE/dublin_core.xml"
  perl "$SCRIPTPATH/scripts/map_metadata.pl" "$xml_file" degois \
      > "item_$DOI_FILE/metadata_degois.xml"

  mylogger "copying file"
  # 'contents' holds the name of the PDF copied into the item folder.
  printf '%s\n' "$filename" > "item_$DOI_FILE/contents"
  cp -prf -- "$file" "item_$DOI_FILE/$filename"

  mylogger "cleaning tempfiles"
  cleanup_tempfiles
  mylogger "ended......................................"