Blame view

ingester/prepare.sh 1.79 KB
9d00822b4   Paulo Graça   Initial commit
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
  #!/bin/bash
  
  # find all PDF files
  # find . -type f -name '*.pdf' -exec ./ingest.sh {} \;
  
  # Print and discard every argument except the last one, so that afterwards
  # "$1" holds the final command-line argument (read below as the input file).
  # (( )) compares the argument count numerically; [[ a > b ]] compares strings.
  while (( $# > 1 ))
  do
      # printf with a quoted expansion prints the argument verbatim: no
      # word-splitting, no glob expansion, and values such as "-n" are not
      # mistaken for echo options.
      printf '%s\n' "$1"
      shift # past argument or value
  done
  
  # mylogger MESSAGE...
  # Writes one line to stdout: the current time formatted by date as
  # "%F %H:%M:%S:%N", followed by ": " and all arguments separated by
  # single spaces.
  mylogger() {
    local IFS=' '   # "$*" then joins the arguments with one space each
    local stamp
    stamp=$(date '+%F %H:%M:%S:%N')
    printf '%s: %s\n' "$stamp" "$*"
  }
  
  # Input file: the positional argument left in "$1".
  # ${1-} yields an empty string when the script is run without arguments.
  file=${1-}
  # Quoted test inside [[ ]]: an empty or space-containing path is rejected
  # here instead of slipping through (unquoted, `[ ! -f ]` evaluates to false).
  if [[ ! -f "$file" ]]; then
      mylogger "[ERROR] couldn't find file - $file"
      exit 1   # non-zero so callers can detect the failure
  fi

  # File name without its directory part; reused as the name of the copy.
  filename=$(basename -- "$file")

  # Current script dir (absolute), resolved in a subshell so the caller's
  # working directory is never changed. cd output is discarded because cd may
  # print the target directory when CDPATH is set.
  SCRIPTPATH=$(cd -- "$(dirname -- "$0")" > /dev/null && pwd) || {
      mylogger "[ERROR] couldn't resolve the script directory"
      exit 1
  }

  mylogger "start extracting DOI from PDF - $file"

  # The helper script's stdout is taken as the DOI of the document.
  DOI=$(perl "$SCRIPTPATH/scripts/extract_pdftext.pl" "$file")

  if [[ -z "$DOI" ]]; then
      mylogger "[ERROR] occurred when extracting the DOI, or it's missing"
      exit 1
  fi
  
  # Split the DOI at "/": first field is the prefix, second field the suffix.
  # A DOI without "/" yields the whole value for both parts (same as cut).
  # NOTE(review): anything after a second "/" is dropped, so DOIs whose suffix
  # contains "/" share one item name - confirm this is acceptable.
  DOI_prefix=${DOI%%/*}
  doi_rest=${DOI#*/}
  DOI_suffix=${doi_rest%%/*}

  DOI_FILE=$DOI_prefix.$DOI_suffix

  # Private scratch directory instead of predictable names directly in /tmp.
  # The EXIT trap removes it on every exit path, including the error exits.
  tmpdir=$(mktemp -d) || {
      mylogger "[ERROR] couldn't create a temporary directory"
      exit 1
  }
  trap 'rm -rf -- "$tmpdir"' EXIT
  json_file=$tmpdir/$DOI_FILE.json
  xml_file=$tmpdir/$DOI_FILE.xml

  # download json file
  #http://api.crossref.org/works/10.4025/reveducfis.v22i3.9976
  mylogger "downloading $DOI from crossref"

  # -w '%{http_code}' prints the status of the final response. Taking the first
  # header line of a -L header dump yields the redirect status (301/302)
  # whenever a redirect is followed. A transfer failure reports "000".
  ERROR_CODE=$(curl -s -L -o "$json_file" -w '%{http_code}' \
      "http://api.crossref.org/works/$DOI")

  if [[ "$ERROR_CODE" != "200" ]]; then
      mylogger "[ERROR] crossref didn't found DOI - $DOI"
      exit 1
  fi

  mylogger "extracting metadata from crossref file"
  # extract metadata from file
  perl "$SCRIPTPATH/scripts/extract_metadata.pl" "$json_file" > "$xml_file"

  # Prepare
  item_dir=item_$DOI_FILE
  mylogger "creating folder ./item_$DOI_FILE"
  mkdir -p -- "$item_dir" || exit 1

  mylogger "mapping data"
  perl "$SCRIPTPATH/scripts/map_metadata.pl" "$xml_file" dc \
      > "$item_dir/dublin_core.xml"
  perl "$SCRIPTPATH/scripts/map_metadata.pl" "$xml_file" degois \
      > "$item_dir/metadata_degois.xml"

  mylogger "copying file"
  # The "contents" file holds the name of the copied file.
  printf '%s\n' "$filename" > "$item_dir/contents"
  # -f already overwrites without prompting, so no "yes |" is needed; -r is
  # dropped because the source was verified above to be a regular file.
  cp -f -- "$file" "$item_dir/$filename" || exit 1

  mylogger "cleaning tempfiles"
  rm -rf -- "$tmpdir"

  mylogger "ended"