Commit b28c1b85e05ecd87aa8efac7edccac6d09d91f10

Authored by Paulo Graça
0 parents
Exists in master

Versioning source-code

Readme.md
... ... @@ -0,0 +1,57 @@
  1 +# Extract IDs from ORCID profiles
  2 +
  3 +This tool was developed to extract IDs from ORCID profiles. It is written in Bash and Perl and produces its output through an XSL transformation, so it can be adapted to other purposes by changing the stylesheet.
  4 +
  5 +At the moment the output is in CSV format, which can be opened in OpenOffice, LibreOffice or MS Excel.
  6 +
  7 +Something like this:
  8 +| ORCID | Name | Researcher ID | Scopus Author ID |
  9 +| ----- | ---- | ------------- | ---------------- |
  10 +| 0000-0001-5000-0736 | Martim Chichorro | J-3782-2013 | 11540746200 |
  11 +| 0000-0001-5000-8754 | Paulo Nossa | | 23490244800 |
  12 +| 0000-0001-5003-7264 | Mafalda Guimarães | | |
  13 +| 0000-0001-5004-6101 | Maria Luisa Morgado | | 55916371200||55955252400 |
  14 +| 0000-0001-5004-7518 | William Martinez | | |
  15 +| 0000-0001-5005-120X | João Lopes | | 23019476500 |
  16 +| 0000-0001-5005-2405 | Emanuel Gamelas | | |
  17 +| 0000-0001-5005-3598 | Sergio Miguel Leandro | M-4254-2013 | 6603233719 |
  18 +| 0000-0001-5006-3008 | Carla Oliveira | | 55972889000 |
  19 +
  20 +## Installation
  21 +
  22 +This tool uses Perl, so Perl must be installed on your system, along with some libXML libraries for parsing and transforming XML.
  23 +
  24 +For Ubuntu systems:
  25 +```
  26 +sudo apt-get install libxml-libxslt-perl libxml-xpath-perl libxml-perl
  27 +```
  28 +
  29 +## How to use
  30 +
  31 +It works in two parts. The first one downloads the profiles of the ORCIDs given in an input file or as an argument; the second one processes each file in the download directory and outputs the result.
  32 +
  33 +```bash
  34 +bash get_orcid_data.sh --file /path/to/orcid_list.txt >> /path/to/output_data.csv
  35 +```
  36 +
  37 +output_data.csv result:
  38 +```
  39 +"0000-0001-5000-0736","Martim Chichorro","J-3782-2013","11540746200"
  40 +"0000-0001-5000-8754","Paulo Nossa","","23490244800"
  41 +"0000-0001-5003-7264","Mafalda Guimarães","",""
  42 +"0000-0001-5004-6101","Maria Luisa Morgado","","55916371200||55955252400"
  43 +"0000-0001-5004-7518","William Martinez","",""
  44 +"0000-0001-5005-120X","João Lopes","","23019476500"
  45 +"0000-0001-5005-2405","Emanuel Gamelas","",""
  46 +"0000-0001-5005-3598","Sergio Miguel Leandro","M-4254-2013","6603233719"
  47 +"0000-0001-5006-3008","Carla Oliveira","","55972889000"
  48 +```
  49 +
  50 +## Other uses
  51 +You can change orcid-map.xslt to output data in other formats. You can also change the http://pub.orcid.org/v1.2/${1}/orcid-bio URL (for example, replacing orcid-bio with affiliations to get the user affiliations) to retrieve any other type of information [provided by ORCID](http://support.orcid.org/knowledgebase/topics/32832-orcid-xml).
  52 +
  53 +
  54 +## Need help or want to contribute?
  55 +
  56 +Please contact us at [FCT|FCCN](http://www.fccn.pt) or any committer.
  57 +
... ...
get_orcid_data.sh
... ... @@ -0,0 +1,160 @@
  1 +#!/bin/bash
  2 +
  3 +# pgraca: paulo.graca@fccn.pt
  4 +# this script downloads orcids from a source list and then process it and return the parsed information
  5 +
  6 +usage()
  7 +{
  8 +cat <<EOF
  9 +Usage: $(basename $0) --file DIR|--orcid ORCID [options]
  10 +
  11 +this script downloads orcids from a source list and then process it and return the parsed information.
  12 +
  13 +Options:
  14 +
  15 + --file Input file with ORCIDs to process.
  16 +
  17 + --dir Target directory to save download data. Default with will be random generated in /tmp.
  18 +
  19 + --orcid Orcid to download and process. Multiple separated by comma (,)
  20 +
  21 + --nodownload Don't download files.
  22 +
  23 + --noprocess Don't process files.
  24 +
  25 + --verbose Output all messages.
  26 +
  27 +
  28 +Examples:
  29 +
  30 + 1. Download an orcid and process it.
  31 +
  32 + $(basename $0) --file /path/to/file.txt
  33 + OrcID=0000-0002-9081-2728
  34 + Name=Joao Mendes Moreira
  35 + ResearcherID=L-3960-2013
  36 +
  37 + 2. Orcid by argument.
  38 +
  39 + $(basename $0) --orcid "0000-0002-9081-2728"
  40 + OrcID=0000-0002-9081-2728
  41 + Name=Joao Mendes Moreira
  42 + ResearcherID=L-3960-2013
  43 +
  44 +EOF
  45 +}
  46 +
  47 +EXPECTED_ARGS=1
  48 +E_BADARGS=65
  49 +
  50 +if [ $# -lt $EXPECTED_ARGS ]
  51 +then
  52 + usage
  53 + exit 0
  54 +fi
  55 +
  56 +
  57 +while [ "$1" ]; do
  58 + case "$1" in
  59 + --file)
  60 + shift
  61 + FILE="$1"
  62 + source="local"
  63 + ;;
  64 + --dir)
  65 + shift
  66 + TEMP_DIR="$1"
  67 + ;;
  68 + --orcid)
  69 + shift
  70 + IFS=',' read -a ORCIDS <<< "${1}"
  71 + source="orcid"
  72 + ;;
  73 + --nodownload)
  74 + shift
  75 + NO_DOWNLOAD=yes
  76 + ;;
  77 + --noprocess)
  78 + shift
  79 + NO_PROCESS=yes
  80 + ;;
  81 + --verbose)
  82 + shift
  83 + VERBOSE=yes
  84 + ;;
  85 + --help)
  86 + usage
  87 + exit 0
  88 + ;;
  89 + esac
  90 + shift
  91 +done
  92 +
  93 +function download_orcid {
  94 + wget --quiet --output-document=${1}.xml http://pub.orcid.org/v1.2/${1}/orcid-bio
  95 +}
  96 +
  97 +echo $1
  98 +
  99 +if [ -z "$source" ]; then
  100 + echo "--inputfile or --orcid is required."
  101 + usage
  102 + exit 0
  103 +fi
  104 +
  105 +# Current script dir
  106 +pushd `dirname $0` > /dev/null
  107 +SCRIPTPATH=`pwd`
  108 +popd > /dev/null
  109 +
  110 +
  111 +if [ ! -d "$TEMP_DIR" ]; then
  112 + # Control will enter here if $DIRECTORY doesn't exist.
  113 + UUID=$(cat /proc/sys/kernel/random/uuid)
  114 + TEMP_DIR=/tmp/${UUID}
  115 + mkdir -p ${TEMP_DIR}
  116 + [ -n "$VERBOSE" ] && echo "Created directory: ${TEMP_DIR}"
  117 +fi
  118 +cd ${TEMP_DIR}
  119 +
  120 +# Downloading
  121 +if [ "$source" == "local" ] && [ -f "$FILE" ]; then
  122 + if [ -z "$NO_DOWNLOAD" ]; then
  123 +
  124 + [ -n "$VERBOSE" ] && echo "Downloading orcids..."
  125 + while read ORCID
  126 + do
  127 + [ -n "$VERBOSE" ] && echo "Downloading... ${ORCID}"
  128 + # Downloading ORCIDs
  129 + download_orcid $ORCID
  130 + done <$FILE
  131 + fi
  132 +fi
  133 +if [ "$source" == "orcid" ]; then
  134 + if [ -z "$NO_DOWNLOAD" ]; then
  135 +
  136 + [ -n "$VERBOSE" ] && echo "Downloading orcids..."
  137 + for ORCID in "${ORCIDS[@]}"
  138 + do
  139 + [ -n "$VERBOSE" ] && echo "Downloading... ${ORCID}"
  140 + # Downloading ORCIDs
  141 + download_orcid $ORCID
  142 + done
  143 + fi
  144 +fi
  145 +
  146 +# Processing
  147 +if [ -z "$NO_PROCESS" ]; then
  148 + [ -n "$VERBOSE" ] && echo "Processing orcids..."
  149 + for f in $TEMP_DIR/*.xml
  150 + do
  151 + if [ -s "$f" ]; then
  152 + # if file is not empty
  153 + [ -n "$VERBOSE" ] && echo "Processing $f file..."
  154 + # take action on each file. $f store current file name
  155 + perl $SCRIPTPATH/parse_orcid.pl ${f}
  156 + else
  157 + echo "$f is empty" >> $SCRIPTPATH/error.log
  158 + fi
  159 + done
  160 +fi
... ...
orcid-map.xslt
... ... @@ -0,0 +1,42 @@
  1 +<?xml version="1.0" encoding="utf-8"?>
  2 +<!-- Renders one ORCID profile document as a single CSV row: ORCID, full name, ResearcherID(s), Scopus Author ID(s). -->
  3 +<xsl:stylesheet
  4 + xmlns:xml="http://www.w3.org/XML/1998/namespace"
  5 + xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
  6 + xmlns:orcid="http://www.orcid.org/ns/orcid"
  7 + version="1.0">
  8 +<xsl:output method='text'/>
  9 +<xsl:template match="/">
  10 + <!-- Column 1: the path of the ORCID identifier. -->
  11 + <xsl:text>"</xsl:text>
  12 + <xsl:value-of select="//orcid:orcid-identifier/orcid:path/text()"/>
  13 + <xsl:text>",</xsl:text>
  14 + <!-- Column 2: given names, a single space, then the family name. -->
  15 + <xsl:text>"</xsl:text>
  16 + <xsl:value-of select="//orcid:orcid-bio/orcid:personal-details/orcid:given-names/text()"/>
  17 + <xsl:text> </xsl:text>
  18 + <xsl:value-of select="//orcid:orcid-bio/orcid:personal-details/orcid:family-name/text()"/>
  19 + <xsl:text>",</xsl:text>
  20 + <!-- Column 3: each reference of an external identifier named 'ResearcherID', joined with "||" when there are several. -->
  21 + <xsl:text>"</xsl:text>
  22 + <xsl:variable name="n_ResearcherID" select="count(//orcid:external-identifier/orcid:external-id-common-name[text()='ResearcherID'])"/>
  23 + <xsl:for-each select="//orcid:external-identifier/orcid:external-id-common-name[text()='ResearcherID']/../orcid:external-id-reference">
  24 + <xsl:value-of select="text()"/>
  25 + <xsl:if test="$n_ResearcherID &gt; 1 and position() != last()">
  26 + <xsl:text>||</xsl:text>
  27 + </xsl:if>
  28 + </xsl:for-each>
  29 + <xsl:text>",</xsl:text>
  30 + <!-- Column 4: each reference of an external identifier named 'Scopus Author ID', joined the same way; the row ends with a line feed. -->
  31 + <xsl:text>"</xsl:text>
  32 + <xsl:variable name="n_ScopusID" select="count(//orcid:external-identifier/orcid:external-id-common-name[text()='Scopus Author ID'])"/>
  33 + <xsl:for-each select="//orcid:external-identifier/orcid:external-id-common-name[text()='Scopus Author ID']/../orcid:external-id-reference">
  34 + <xsl:value-of select="text()"/>
  35 + <xsl:if test="$n_ScopusID &gt; 1 and position() != last()">
  36 + <xsl:text>||</xsl:text>
  37 + </xsl:if>
  38 + </xsl:for-each>
  39 + <xsl:text>"&#xa;</xsl:text>
  40 + </xsl:template>
  41 +
  42 +</xsl:stylesheet>
... ...
parse_orcid.pl
... ... @@ -0,0 +1,34 @@
  1 +#!/usr/bin/perl -w
  2 +
  3 +#requires packages libxml-libxslt-perl, libxml-xpath-perl, libxml-perl
  4 +
  5 +# (1) quit unless we have the correct number of command-line args
  6 +$num_args = $#ARGV + 1;
  7 +if ($num_args != 1) {
  8 + print "\nUsage: parse_orcid.pl path/to/filename.xml\n";
  9 + exit;
  10 +}
  11 +
  12 +
  13 +$filename=$ARGV[0];
  14 +unless (-e $filename) {
  15 + print "\nFile: $filename doesn't exist!\n";
  16 + exit;
  17 +}
  18 +
  19 +use XML::LibXML;
  20 +use XML::LibXSLT;
  21 +use File::Spec;
  22 +
  23 +my ($volume, $directory, $file) = File::Spec->splitpath(__FILE__);
  24 +
  25 +XML::LibXSLT->max_depth(1000);
  26 +
  27 +my $xslt = XML::LibXSLT->new();
  28 +
  29 +my $source = XML::LibXML->load_xml(location => $filename);
  30 +my $style_doc = XML::LibXML->load_xml(location=>$directory.'/orcid-map.xslt', no_cdata=>1);
  31 +my $stylesheet = $xslt->parse_stylesheet($style_doc);
  32 +my $results = $stylesheet->transform($source);
  33 +
  34 +print $stylesheet->output_as_bytes($results);
... ...