Exercises based on the tutorial: https://github.com/eseiver/xml_tutorial
PLOS DOIs begin with the prefix "10.1371/journal.", like "10.1371/journal.pone.0185809".
The XML file for that article is named "journal.pone.0185809.xml".
from allofplos import Article # if have run `pip install allofplos`
# from article_class import Article # if inside cloned GitHub directory
# first instantiation of Article class by DOI
article = Article('10.1371/journal.pone.0178690')
article.title
'Physician assessments of drug seeking behavior: A mixed methods study'
# first instantiation of Article class by filename
article = Article.from_filename('allofplos_xml/journal.pone.0181748.xml')
article.title
'THPdb: Database of FDA-approved peptide and protein therapeutics'
# new article
article.doi = '10.1371/journal.pone.0183591'
article.title
'A checklist is associated with increased quality of reporting preclinical biomedical research: A systematic review'
Try printing or returning some of these values
article.doi
'10.1371/journal.pone.0183591'
article.journal
'PLOS ONE'
article.pubdate
datetime.datetime(2017, 9, 13, 0, 0)
article.title
'A checklist is associated with increased quality of reporting preclinical biomedical research: A systematic review'
article.counts
{'fig-count': '3', 'page-count': '14', 'table-count': '2'}
article.word_count
4954
print(article.abstract[:310])
Irreproducibility of preclinical biomedical research has gained recent attention. It is suggested that requiring authors to complete a checklist at the time of manuscript submission would improve the quality and transparency of scientific reporting, and ultimately enhance reproducibility. Whether a checklist
contributor = article.contributors[0]
contributor.keys()
dict_keys(['contrib_initials', 'given_names', 'surname', 'group_name', 'ids', 'rid_dict', 'contrib_type', 'author_type', 'editor_type', 'email', 'affiliations', 'author_roles', 'footnotes'])
article.authors[0]
{'affiliations': ['Division of Pulmonary, Allergy, and Critical Care Medicine, Department of Medicine, University of Pittsburgh, Pittsburgh, Pennsylvania, United States of America'], 'author_roles': {'CASRAI CREDiT taxonomy': ['Conceptualization', 'Data curation', 'Formal analysis', 'Funding acquisition', 'Investigation', 'Writing – original draft', 'Writing – review & editing']}, 'author_type': 'corresponding', 'contrib_initials': 'SH', 'contrib_type': 'author', 'editor_type': None, 'email': ['shan.workmd@gmail.com'], 'footnotes': ['Current address: Division of Pulmonary and Critical Care, Department of Medicine, Northwestern University, Chicago, Illinois, United States of America'], 'given_names': 'SeungHye', 'group_name': None, 'ids': [{'authenticated': 'true', 'id': 'http://orcid.org/0000-0001-5625-6337', 'id_type': 'orcid'}], 'rid_dict': {'aff': ['aff001'], 'corresp': ['cor001'], 'fn': ['currentaff001']}, 'surname': 'Han'}
article.corr_author
[{'affiliations': ['Division of Pulmonary, Allergy, and Critical Care Medicine, Department of Medicine, University of Pittsburgh, Pittsburgh, Pennsylvania, United States of America'], 'author_roles': {'CASRAI CREDiT taxonomy': ['Conceptualization', 'Data curation', 'Formal analysis', 'Funding acquisition', 'Investigation', 'Writing – original draft', 'Writing – review & editing']}, 'author_type': 'corresponding', 'contrib_initials': 'SH', 'contrib_type': 'author', 'editor_type': None, 'email': ['shan.workmd@gmail.com'], 'footnotes': ['Current address: Division of Pulmonary and Critical Care, Department of Medicine, Northwestern University, Chicago, Illinois, United States of America'], 'given_names': 'SeungHye', 'group_name': None, 'ids': [{'authenticated': 'true', 'id': 'http://orcid.org/0000-0001-5625-6337', 'id_type': 'orcid'}], 'rid_dict': {'aff': ['aff001'], 'corresp': ['cor001'], 'fn': ['currentaff001']}, 'surname': 'Han'}]
article.editor[0]
{'affiliations': ['Fraunhofer Research Institution of Marine Biotechnology, GERMANY'], 'author_roles': {None: ['Editor']}, 'author_type': None, 'contrib_initials': 'JB', 'contrib_type': 'editor', 'editor_type': None, 'email': None, 'footnotes': [], 'given_names': 'Johannes', 'group_name': None, 'ids': [], 'rid_dict': {'aff': ['edit1']}, 'surname': 'Boltze'}
article.type_ # JATS
'research-article'
article.plostype
'Research Article'
article.proof # whether an uncorrected proof/early version or not
article.filename
'/Users/Elizabeth/PLOS_Corpus_Project/allofplos/allofplos/allofplos_xml/journal.pone.0183591.xml'
article.local
True
article.tree
<lxml.etree._ElementTree at 0x10efa9dc8>
article.root
<Element article at 0x1118cf348>
article.xml
article.get_dates()
{'accepted': datetime.datetime(2017, 8, 7, 0, 0), 'collection': datetime.datetime(2017, 1, 1, 0, 0), 'epub': datetime.datetime(2017, 9, 13, 0, 0), 'received': datetime.datetime(2017, 3, 19, 0, 0)}
article.check_if_doi_resolves()
'works'
article
DOI: 10.1371/journal.pone.0183591 Title: A checklist is associated with increased quality of reporting preclinical biomedical research: A systematic review
print(article)
<article-title>Why Most Published Research Findings Are False</article-title>
element.tag
'article-title'
element.text
'Why Most Published Research Findings Are False'
<alt-title alt-title-type="running-head">Essay</alt-title>
element.tag
'alt-title'
element.text
'Essay'
element.attrib
{'alt-title-type': 'running-head'}
# for any text that comes directly after closing tag and before another tag
element.tail
XML(element)
<alt-title alt-title-type="running-head">Essay</alt-title>
element.attrib
{'alt-title-type': 'running-head'}
element.attrib['alt-title-type']
'running-head'
element.attrib.get('alt-title-type')
'running-head'
<title-group>
<article-title>Why Most Published Research Findings Are False</article-title>
<alt-title alt-title-type="running-head">Essay</alt-title>
</title-group>
# to find direct children; don't need to know their tags
# (list(element) is the replacement for the deprecated element.getchildren())
list(element)
[<Element article-title at 0x10ab16888>, <Element alt-title at 0x1087d06c8>]
# indexing an element returns its child at that position
# (replaces the deprecated element.getchildren()[0])
new_element = element[0]
new_element.tag
'article-title'
# to find direct ancestor; don't need to know its tag
new_element.getparent()
<Element title-group at 0x10ab16588>
<license xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:mml="http://www.w3.org/1998/Math/MathML" xlink:href="http://creativecommons.org/licenses/by/4.0/" xlink:type="simple">
<license-p>This is an open access article distributed under the terms of the <ext-link ext-link-type="uri" xlink:href="http://creativecommons.org/licenses/by/4.0/" xlink:type="simple">Creative Commons Attribution License</ext-link>, which permits unrestricted use, distribution, and reproduction in any medium, provided the original author and source are credited.</license-p>
</license>
# search direct descendants by name
license.xpath('./license-p')
[<Element license-p at 0x10ab47488>]
# search descendants of direct descendants
license.xpath('./license-p/ext-link')
[<Element ext-link at 0x10ab445c8>]
# search ALL descendants
license.xpath('.//ext-link')
[<Element ext-link at 0x10ab445c8>]
<contrib-group>
<contrib contrib-type="author">
<name name-style="western">
<surname>dos Santos</surname>
<given-names>Renato Vieira</given-names>
</name>
</contrib>
<contrib contrib-type="author">
<name name-style="western">
<surname>da Silva</surname>
<given-names>Linaena Mericy</given-names>
</name>
</contrib>
</contrib-group>
element.xpath('./contrib')
[<Element contrib at 0x10ab44608>, <Element contrib at 0x10ab4d508>]
<license xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:mml="http://www.w3.org/1998/Math/MathML" xlink:href="http://creativecommons.org/licenses/by/4.0/" xlink:type="simple">
<license-p>This is an open access article distributed under the terms of the <ext-link ext-link-type="uri" xlink:href="http://creativecommons.org/licenses/by/4.0/" xlink:type="simple">Creative Commons Attribution License</ext-link>, which permits unrestricted use, distribution, and reproduction in any medium, provided the original author and source are credited.</license-p>
</license>
print(license.text)
None
import lxml.etree as et
license_text = et.tostring(license, method='text', encoding='unicode')
print(license_text)
This is an open access article distributed under the terms of the Creative Commons Attribution License, which permits unrestricted use, distribution, and reproduction in any medium, provided the original author and source are credited.
<contrib contrib-type="author" equal-contrib="yes"> <name name-style="western"> <surname>Chen</surname> <given-names>Ximing</given-names> </name>plossy </contrib>
element.tag: contrib element.attrib: {'contrib-type': 'author', 'equal-contrib': 'yes'}
<contrib contrib-type="author" equal-contrib="yes"> <name name-style="western"> <surname>Chen</surname> <given-names>Ximing</given-names> </name>plossy </contrib>
new_element = element.xpath('./name')[0]
new_element.tag: name new_element.attrib: {'name-style': 'western'} new_element.tail: plossy
from allofplos import Article
doi = '10.1371/journal.pone.0183591'
article = Article(doi)
article.filename
'/Users/Elizabeth/PLOS_Corpus_Project/allofplos/allofplos/allofplos_xml/journal.pone.0183591.xml'
# Is the article XML file locally stored?
article.local
True
tree = article.tree
tree.xpath('./body')
[<Element body at 0x10b176a48>]
xml_root = article.root
xml_root.xpath('.//license')
[<Element license at 0x10ab58488>]
license = xml_root.xpath('.//license')[0]
license.attrib
{'{http://www.w3.org/1999/xlink}href': 'http://creativecommons.org/licenses/by/4.0/', '{http://www.w3.org/1999/xlink}type': 'simple'}
methods_sections = xml_root.xpath("//sec[@sec-type='materials|methods']")
print(methods_sections)
[<Element sec at 0x10b176b48>]
from allofplos.samples.corpus_analysis import get_random_list_of_dois
from allofplos.article_class import Article
import lxml.etree as et
# Collect the DOIs of articles whose Methods section mentions 'PCR'.

# First get list of articles/DOIs
dois = get_random_list_of_dois(count=50)
pcr_list = []

# Initialize a single article object; it is reused for every DOI below
article = Article(dois[0])

for doi in dois:
    # Step 1: point the existing article object at the next DOI
    article.doi = doi
    xml_root = article.root

    # Step 2: find Method sections, trying sec-type 'materials|methods'
    # first and falling back to sec-type 'methods' if nothing matched
    methods_sections = xml_root.xpath("//sec[@sec-type='materials|methods']")
    if not methods_sections:
        methods_sections = xml_root.xpath("//sec[@sec-type='methods']")

    for sec in methods_sections:
        # Step 3: turn the method section into a plain-text string
        method_string = et.tostring(sec, method='text', encoding='unicode')

        # Step 4: add DOI if 'PCR' in string, then stop scanning this
        # article's remaining sections so the DOI is appended only once
        if 'PCR' in method_string:
            pcr_list.append(article.doi)
            break
print(pcr_list[0:5])
['10.1371/journal.pone.0128195', '10.1371/journal.pone.0165464', '10.1371/journal.pone.0136574', '10.1371/journal.pone.0072749', '10.1371/journal.pone.0060101']