Source code for hepdata.modules.inspire_api.parser

# This file is part of HEPData.
# Copyright (C) 2016 CERN.
#
# HEPData is free software; you can redistribute it
# and/or modify it under the terms of the GNU General Public License as
# published by the Free Software Foundation; either version 2 of the
# License, or (at your option) any later version.
#
# HEPData is distributed in the hope that it will be
# useful, but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with HEPData; if not, write to the
# Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston,
# MA 02111-1307, USA.
#
# In applying this license, CERN does not
# waive the privileges and immunities granted to it by virtue of its status
# as an Intergovernmental Organization or submit itself to any jurisdiction.

"""Functions for parsing the new INSPIRE JSON metadata."""

from copy import deepcopy

parsed_content_defaults = {
    'title': None,
    'doi': None,
    'authors': None,
    'type': [],
    'abstract': 'None',
    'creation_date': None,
    'arxiv_id': None,
    'collaborations': [],
    'keywords': [],
    'journal_info': 'No Journal Information',
    'year': None,
    'subject_area': [],
}


[docs]def get_title(metadata):
    """Get the title of the publication from the first value in list of english translations (if applicable) otherwise from first title in list of titles."""
    title = deepcopy(parsed_content_defaults['title'])
    if 'title_translations' in metadata.keys():
        for title_translation in metadata['title_translations']:
            if title_translation['language'] == 'en':
                title = title_translation['title']
    if title is parsed_content_defaults['title'] and 'titles' in metadata.keys() and len(metadata['titles']) > 0:
        title = metadata['titles'][0]['title']
    return title


[docs]def get_doi(metadata):
    """Get the DOI of the journal publication from the first value in the list of DOIs."""
    doi = deepcopy(parsed_content_defaults['doi'])
    if 'dois' in metadata and len(metadata['dois']) > 0:
        doi = metadata['dois'][0]['value']
    return doi


[docs]def get_authors(metadata):
    """Get the authors of the publication as a list of dictionaries with keys 'affiliation' and 'full_name'."""
    authors = deepcopy(parsed_content_defaults['authors'])
    if 'authors' in metadata.keys():
        authors = [{'affiliation': (author['affiliations'][0]['value'] if 'affiliations' in author.keys() else ''),
                    'full_name': author['full_name']}
                   for author in metadata['authors']]
    return authors


[docs]def get_type(metadata):
    """Get the type of the publication."""
    _type = deepcopy(parsed_content_defaults['type'])
    if 'document_type' in metadata.keys():
        _type = metadata['document_type']
    return _type


[docs]def get_abstract(metadata):
    """Get the abstract of the publication, ideally the one from the arXiv version, otherwise the first one."""
    abstract = deepcopy(parsed_content_defaults['abstract'])
    if 'abstracts' in metadata.keys():
        abstract = metadata['abstracts'][0]['value']
        for _abstract in metadata['abstracts']:
            if 'value' in _abstract.keys() and 'source' in _abstract.keys() and _abstract['source'] == 'arXiv':
                abstract = _abstract['value']
    return abstract


[docs]def get_creation_date(metadata):
    """Get the creation date of the publication, first try to expand the preprint_date, otherwise try legacy_creation_date."""
    creation_date = deepcopy(parsed_content_defaults['creation_date'])
    if 'preprint_date' in metadata.keys():
        creation_date = expand_date(metadata['preprint_date'])
    elif 'legacy_creation_date' in metadata:
        creation_date = metadata['legacy_creation_date']
    return creation_date


[docs]def get_arxiv_id(metadata):
    """Get the arxiv id of the publication from the last value in the list of arxiv eprints."""
    arxiv_id = deepcopy(parsed_content_defaults['arxiv_id'])
    if 'arxiv_eprints' in metadata.keys():
        arxiv_id = 'arXiv:' + metadata['arxiv_eprints'][-1]['value']
    return arxiv_id


[docs]def get_collaborations(metadata):
    """Get the collaborations of the publication as a list."""
    collaborations = deepcopy(parsed_content_defaults['collaborations'])
    if 'collaborations' in metadata:
        collaborations = [collaboration['value'] for collaboration in metadata['collaborations']]
    return collaborations


[docs]def get_keywords(metadata):
    """Get the keywords of the publication."""
    keywords = deepcopy(parsed_content_defaults['keywords'])
    if 'keywords' in metadata.keys():
        keywords = metadata['keywords']
    return keywords


[docs]def get_journal_info(metadata):
    """
    Get the journal information of the publication. Format is 'title volume (year) article page_start-page_end' if at least one of these information is available,
    otherwise attempt to obtain it from 'pubinfo_freetext' or 'publication_info' or 'report_numbers' or 'public_notes'. Defaults to 'No Journal Information'.
    """
    default_journal_info, journal_info = deepcopy(parsed_content_defaults['journal_info']), ''
    if 'publication_info' in metadata:
        if 'journal_title' in metadata['publication_info'][0].keys():
            journal_info += metadata['publication_info'][0]['journal_title'] + ' '
        if 'journal_volume' in metadata['publication_info'][0].keys():
            journal_info += metadata['publication_info'][0]['journal_volume'] + ' '
        if 'year' in metadata['publication_info'][0].keys():
            journal_info += '(' + str(metadata['publication_info'][0]['year']) + ') '
        if 'artid' in metadata['publication_info'][0].keys():
            journal_info += metadata['publication_info'][0]['artid'] + ' '
        if 'page_start' in metadata['publication_info'][0].keys() and 'page_end' in metadata['publication_info'][0].keys():
            journal_info += metadata['publication_info'][0]['page_start'] + "-" + metadata['publication_info'][0]['page_end']
        if journal_info != '':
            journal_info = journal_info.strip()  # trim to remove whitespace
            return journal_info
    if ('publication_info' in metadata and len(metadata['publication_info']) > 0 and type(metadata['publication_info'][0]) is dict and
       'pubinfo_freetext' in metadata['publication_info'][0].keys()):
        journal_info = metadata['publication_info'][0]['pubinfo_freetext']
    elif 'report_numbers' in metadata and len(metadata['report_numbers']) > 0:
        journal_info = metadata['report_numbers'][0]['value']
    elif ('public_notes' in metadata.keys() and any(['value' in public_note.keys() and "Submitted to " in public_note['value'] for public_note in metadata['public_notes']])):
        journal_info = [public_note['value'].replace("Submitted to ", "") for public_note in metadata['public_notes'] if
                        ('value' in public_note.keys() and "Submitted to " in public_note['value'])][0]
    if '. All figures' in journal_info:
        journal_info = journal_info.replace('. All figures', '')
    if journal_info != '':
        return journal_info
    else:
        return default_journal_info


[docs]def get_year(metadata):
    """Get the year of the publication. Try first 'imprints/date', then 'publication_info/year', then 'preprint_date', and finally 'legacy_creation_date'."""
    year = deepcopy(parsed_content_defaults['year'])
    if 'imprints' in metadata.keys() and any(['date' in imprint.keys() and len(imprint['date']) == 4 for imprint in metadata['imprints']]):
        year = [imprint['date'] for imprint in metadata['imprints'] if 'date' in imprint.keys() and len(imprint['date']) == 4][0]
    elif ('publication_info' in metadata and 'year' in metadata['publication_info'][0].keys()):
        year = str(metadata['publication_info'][0]['year'])
    elif 'preprint_date' in metadata.keys():
        year = metadata['preprint_date'].split("-")[0]
    elif 'legacy_creation_date' in metadata:
        year = metadata['legacy_creation_date'].split("-")[0]
    return year


[docs]def get_subject_area(metadata):
    subject_area = deepcopy(parsed_content_defaults['subject_area'])
    if 'arxiv_eprints' in metadata.keys():
        subject_area += metadata['arxiv_eprints'][-1]['categories']
    if ('inspire_categories' in metadata.keys() and len(metadata['inspire_categories']) > 0):
        subject_area += [entry['term'].replace('Experiment-HEP', 'hep-ex').replace('Experiment-Nucl', 'nucl-ex').replace('Theory-Nucl', 'nucl-th') for
                         entry in metadata['inspire_categories'] if 'term' in entry.keys() and entry['term'] != 'Other']
    subject_area = list(set(subject_area))
    return subject_area


[docs]def updated_parsed_content_for_thesis(content, parsed_content):
    parsed_content['dissertation'] = content['metadata']['thesis_info']
    # fix dissertation/institutions -> dissertation/institution if there is only one
    if ('institutions' in parsed_content['dissertation'].keys() and
       len(parsed_content['dissertation']['institutions']) == 1 and
       'name' in parsed_content['dissertation']['institutions'][0]):
        parsed_content['dissertation']['institution'] = parsed_content['dissertation']['institutions'][0]['name']
        parsed_content['dissertation'].pop('institutions')
    # update year with thesis info
    if 'date' in content['metadata']['thesis_info'].keys():
        parsed_content['year'] = content['metadata']['thesis_info']['date']
        if parsed_content['year'] is not None:
            if content['metadata']['legacy_creation_date'][:4] == parsed_content['year']:
                parsed_content['creation_date'] = content['metadata']['legacy_creation_date']
            else:
                parsed_content['creation_date'] = expand_date(parsed_content['year'])
    # fix capitals in dissertation/type
    if 'degree_type' in parsed_content['dissertation'].keys():
        parsed_content['dissertation']['type'] = parsed_content['dissertation'].pop('degree_type').title()
        if parsed_content['dissertation']['type'] == "Phd":
            parsed_content['dissertation']['type'] = "PhD"
    # fix dissertation/defence_date string
    if 'date' in parsed_content['dissertation'].keys():
        parsed_content['dissertation']['defense_date'] = parsed_content['dissertation'].pop('date')
    return parsed_content


[docs]def expand_date(value):
    """
    In the case where the date is not completely
    formed, we need to expand it out.
    so 2012-08 will be 2012-08-01
    and 2012 will be 2012-01-01.
    If nothing, we do nothing.
    """
    if value == '':
        return value

    date_parts = value.split('-')

    if len(date_parts) == 1:
        date_parts.append('01')
    if len(date_parts) == 2:
        date_parts.append('01')
    return "-".join(date_parts)
Source code for hepdata.modules.inspire_api.parser

Navigation

Related Topics