# Source code for hepdata.modules.inspire_api.parser
# This file is part of HEPData.
# Copyright (C) 2016 CERN.
#
# HEPData is free software; you can redistribute it
# and/or modify it under the terms of the GNU General Public License as
# published by the Free Software Foundation; either version 2 of the
# License, or (at your option) any later version.
#
# HEPData is distributed in the hope that it will be
# useful, but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with HEPData; if not, write to the
# Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston,
# MA 02111-1307, USA.
#
# In applying this license, CERN does not
# waive the privileges and immunities granted to it by virtue of its status
# as an Intergovernmental Organization or submit itself to any jurisdiction.
"""Functions for parsing the new INSPIRE JSON metadata."""
from copy import deepcopy
# Fallback values, one per parsed field. The get_* functions in this module
# deep-copy the matching entry so that callers never share a mutable default.
parsed_content_defaults = {
    'title': None,
    'doi': None,
    'authors': None,
    'type': [],
    'abstract': 'None',  # the string 'None', not the None singleton
    'creation_date': None,
    'arxiv_id': None,
    'collaborations': [],
    'keywords': [],
    'journal_info': 'No Journal Information',
    'year': None,
    'subject_area': [],
}
def get_title(metadata):
    """
    Get the title of the publication.

    The first English entry of 'title_translations' is preferred; if there is none, the first entry of 'titles'
    is used.

    :param metadata: dictionary with the INSPIRE record metadata.
    :return: the title, or the 'title' default from ``parsed_content_defaults`` if none is found.
    """
    for translation in metadata.get('title_translations') or []:
        translated_title = translation.get('title')
        # Return on the first English match so the result agrees with the documented "first value" contract;
        # entries without a language or without a title are skipped.
        if translation.get('language') == 'en' and translated_title is not None:
            return translated_title

    titles = metadata.get('titles')
    if titles:
        return titles[0]['title']

    return deepcopy(parsed_content_defaults['title'])
def get_doi(metadata):
    """
    Get the DOI of the journal publication.

    :param metadata: dictionary with the INSPIRE record metadata.
    :return: the 'value' of the first entry of 'dois', or the 'doi' default from ``parsed_content_defaults``
        if 'dois' is missing or empty.
    """
    if 'dois' not in metadata or len(metadata['dois']) == 0:
        return deepcopy(parsed_content_defaults['doi'])

    first_doi = metadata['dois'][0]
    return first_doi['value']
def get_authors(metadata):
    """
    Get the authors of the publication as a list of dictionaries with keys 'affiliation' and 'full_name'.

    Only the first affiliation of each author is kept. An author whose 'affiliations' entry is missing or empty
    gets an empty string as affiliation.

    :param metadata: dictionary with the INSPIRE record metadata.
    :return: list of author dictionaries, or the 'authors' default from ``parsed_content_defaults`` if the
        metadata has no 'authors' key.
    """
    if 'authors' not in metadata:
        return deepcopy(parsed_content_defaults['authors'])

    authors = []
    for author in metadata['authors']:
        affiliations = author.get('affiliations')
        authors.append({
            # An empty affiliations list is handled like a missing one rather than raising IndexError.
            'affiliation': affiliations[0]['value'] if affiliations else '',
            'full_name': author['full_name'],
        })
    return authors
def get_type(metadata):
    """
    Get the type of the publication.

    :param metadata: dictionary with the INSPIRE record metadata.
    :return: the 'document_type' entry as stored in the metadata, or the 'type' default from
        ``parsed_content_defaults`` if it is absent.
    """
    if 'document_type' not in metadata.keys():
        return deepcopy(parsed_content_defaults['type'])
    return metadata['document_type']
def get_abstract(metadata):
    """
    Get the abstract of the publication, ideally the one from the arXiv version, otherwise the first one.

    Entries of 'abstracts' that carry no 'value' are ignored. If several entries have source 'arXiv', the last
    of them is used.

    :param metadata: dictionary with the INSPIRE record metadata.
    :return: the abstract text, or the 'abstract' default from ``parsed_content_defaults`` if no abstract text
        is available.
    """
    candidates = [entry for entry in metadata.get('abstracts') or [] if 'value' in entry]
    if not candidates:
        # Covers a missing key, an empty list and entries without any text.
        return deepcopy(parsed_content_defaults['abstract'])

    arxiv_texts = [entry['value'] for entry in candidates if entry.get('source') == 'arXiv']
    return arxiv_texts[-1] if arxiv_texts else candidates[0]['value']
def get_creation_date(metadata):
    """
    Get the creation date of the publication.

    The 'preprint_date' is preferred and is passed through ``expand_date``; otherwise 'legacy_creation_date' is
    returned unchanged.

    :param metadata: dictionary with the INSPIRE record metadata.
    :return: the creation date, or the 'creation_date' default from ``parsed_content_defaults`` if neither key
        is present.
    """
    if 'preprint_date' in metadata.keys():
        return expand_date(metadata['preprint_date'])
    if 'legacy_creation_date' in metadata:
        return metadata['legacy_creation_date']
    return deepcopy(parsed_content_defaults['creation_date'])
def get_arxiv_id(metadata):
    """
    Get the arXiv id of the publication from the last value in the list of arXiv eprints.

    :param metadata: dictionary with the INSPIRE record metadata.
    :return: the id prefixed with 'arXiv:', or the 'arxiv_id' default from ``parsed_content_defaults`` if
        'arxiv_eprints' is missing or empty.
    """
    eprints = metadata.get('arxiv_eprints')
    if not eprints:
        # An empty list is handled like a missing key instead of raising IndexError.
        return deepcopy(parsed_content_defaults['arxiv_id'])
    return 'arXiv:' + eprints[-1]['value']
def get_collaborations(metadata):
    """
    Get the collaborations of the publication as a list.

    :param metadata: dictionary with the INSPIRE record metadata.
    :return: list with the 'value' of every entry of 'collaborations', or the 'collaborations' default from
        ``parsed_content_defaults`` if the key is absent.
    """
    if 'collaborations' not in metadata:
        return deepcopy(parsed_content_defaults['collaborations'])

    names = []
    for entry in metadata['collaborations']:
        names.append(entry['value'])
    return names
def get_keywords(metadata):
    """
    Get the keywords of the publication.

    :param metadata: dictionary with the INSPIRE record metadata.
    :return: the 'keywords' entry as stored in the metadata, or the 'keywords' default from
        ``parsed_content_defaults`` if it is absent.
    """
    has_keywords = 'keywords' in metadata.keys()
    return metadata['keywords'] if has_keywords else deepcopy(parsed_content_defaults['keywords'])
def get_journal_info(metadata):
    """
    Get the journal information of the publication.

    Format is 'title volume (year) article page_start-page_end', built from whichever of these fields are present
    in the first 'publication_info' entry. If none of them is available, fall back in turn to the
    'pubinfo_freetext' of that entry, to the first of 'report_numbers', and to the first of 'public_notes' that
    contains "Submitted to ".

    :param metadata: dictionary with the INSPIRE record metadata.
    :return: the journal information string, or the 'journal_info' default from ``parsed_content_defaults`` if
        nothing is found.
    """
    publications = metadata.get('publication_info') or []
    # An empty list or a non-dict first entry is treated as "no publication info" for every lookup below.
    publication = publications[0] if publications and isinstance(publications[0], dict) else {}

    parts = []
    if 'journal_title' in publication:
        parts.append(publication['journal_title'])
    if 'journal_volume' in publication:
        parts.append(publication['journal_volume'])
    if 'year' in publication:
        parts.append('(' + str(publication['year']) + ')')
    if 'artid' in publication:
        parts.append(publication['artid'])
    if 'page_start' in publication and 'page_end' in publication:
        # A page range is only shown when both ends are known.
        parts.append(publication['page_start'] + "-" + publication['page_end'])

    journal_info = ' '.join(parts).strip()  # trim to remove whitespace
    if journal_info:
        return journal_info

    if 'pubinfo_freetext' in publication:
        journal_info = publication['pubinfo_freetext']
    elif metadata.get('report_numbers'):
        journal_info = metadata['report_numbers'][0]['value']
    else:
        submitted_notes = [public_note['value'] for public_note in metadata.get('public_notes') or []
                           if 'value' in public_note and "Submitted to " in public_note['value']]
        if submitted_notes:
            # Keep only the journal name: drop the "Submitted to " lead-in and the '. All figures' marker.
            journal_info = submitted_notes[0].replace("Submitted to ", "").replace('. All figures', '')

    if journal_info:
        return journal_info
    return deepcopy(parsed_content_defaults['journal_info'])
def get_year(metadata):
    """
    Get the year of the publication.

    Try first 'imprints/date' (only dates that are exactly four characters long), then 'publication_info/year' of
    the first publication, then the part of 'preprint_date' before the first '-', and finally the same part of
    'legacy_creation_date'.

    :param metadata: dictionary with the INSPIRE record metadata.
    :return: the year, or the 'year' default from ``parsed_content_defaults`` if none of the sources is present.
    """
    imprint_years = (imprint['date'] for imprint in metadata.get('imprints') or []
                     if 'date' in imprint and len(imprint['date']) == 4)
    imprint_year = next(imprint_years, None)
    if imprint_year is not None:
        return imprint_year

    publications = metadata.get('publication_info')
    # An empty 'publication_info' list falls through to the date-based sources instead of raising IndexError.
    if publications and 'year' in publications[0]:
        return str(publications[0]['year'])

    if 'preprint_date' in metadata:
        return metadata['preprint_date'].split("-")[0]
    if 'legacy_creation_date' in metadata:
        return metadata['legacy_creation_date'].split("-")[0]

    return deepcopy(parsed_content_defaults['year'])
def get_subject_area(metadata):
    """
    Get the subject areas of the publication.

    Combines the 'categories' of the last entry of 'arxiv_eprints' with the 'term' of every entry of
    'inspire_categories' (skipping 'Other'), after renaming 'Experiment-HEP', 'Experiment-Nucl' and 'Theory-Nucl'
    to 'hep-ex', 'nucl-ex' and 'nucl-th'. Duplicates are removed and the first-seen order is kept, so the result
    is the same on every run.

    :param metadata: dictionary with the INSPIRE record metadata.
    :return: list of subject areas, or the 'subject_area' default from ``parsed_content_defaults`` if none is
        found.
    """
    subject_area = []

    eprints = metadata.get('arxiv_eprints')
    if eprints:
        subject_area += eprints[-1].get('categories') or []

    for entry in metadata.get('inspire_categories') or []:
        if 'term' in entry and entry['term'] != 'Other':
            subject_area.append(entry['term']
                                .replace('Experiment-HEP', 'hep-ex')
                                .replace('Experiment-Nucl', 'nucl-ex')
                                .replace('Theory-Nucl', 'nucl-th'))

    if not subject_area:
        return deepcopy(parsed_content_defaults['subject_area'])

    # dict keys are unique and keep insertion order, unlike a set whose iteration order varies between runs.
    return list(dict.fromkeys(subject_area))
def updated_parsed_content_for_thesis(content, parsed_content):
    """
    Update the parsed content of a thesis record with its INSPIRE 'thesis_info'.

    Adds a 'dissertation' entry and, if the thesis has a date, overrides 'year' and 'creation_date'.
    The 'dissertation' entry is the same dictionary object as ``content['metadata']['thesis_info']``, so the key
    renames done here are also visible through ``content``.

    :param content: INSPIRE record; must contain ``content['metadata']['thesis_info']``.
    :param parsed_content: dictionary of already parsed fields; it is modified in place.
    :return: the updated ``parsed_content``.
    """
    metadata = content['metadata']
    dissertation = metadata['thesis_info']
    parsed_content['dissertation'] = dissertation

    # fix dissertation/institutions -> dissertation/institution if there is only one
    institutions = dissertation.get('institutions')
    if institutions is not None and len(institutions) == 1 and 'name' in institutions[0]:
        dissertation['institution'] = institutions[0]['name']
        dissertation.pop('institutions')

    # update year with thesis info (the thesis date is stored as given)
    if 'date' in dissertation:
        year = dissertation['date']
        parsed_content['year'] = year
        if year is not None:
            # 'legacy_creation_date' is optional; it is only used when it starts with the thesis date.
            legacy_creation_date = metadata.get('legacy_creation_date')
            if legacy_creation_date and legacy_creation_date[:4] == year:
                parsed_content['creation_date'] = legacy_creation_date
            else:
                parsed_content['creation_date'] = expand_date(year)

    # fix capitals in dissertation/type
    if 'degree_type' in dissertation:
        degree_type = dissertation.pop('degree_type').title()
        dissertation['type'] = "PhD" if degree_type == "Phd" else degree_type

    # fix dissertation/defence_date string; this must stay after the year update, which reads 'date'
    if 'date' in dissertation:
        dissertation['defense_date'] = dissertation.pop('date')

    return parsed_content
def expand_date(value):
    """
    Expand a partial date to the full 'YYYY-MM-DD' form.

    In the case where the date is not completely formed, we need to expand it out,
    so 2012-08 will be 2012-08-01
    and 2012 will be 2012-01-01.
    A complete date is returned unchanged.
    If nothing ('' or None), we do nothing.

    :param value: date string with '-' separated parts, or an empty value.
    :return: the expanded date string, or ``value`` itself when it is empty.
    """
    if not value:
        return value

    date_parts = value.split('-')
    # Pad the missing month and/or day with '01'; nothing is added when three or more parts are present.
    date_parts += ['01'] * (3 - len(date_parts))
    return "-".join(date_parts)