working on an app https://share.streamlit.io/carrlucy/hsl_oa/main that recurses the europmc database looking for open data and the restful api that is provided includes a "nextcursormark" field so that queries can do pagination...
i'm stumbling with how to handle that information and would appreciate ideas?
i know that the variable i'm looking for is stored in the parsed variable of root[2]
the following works to get the first set of results (root[4] is the xml element tree that provides the data for the other for loops, and i need to wrap that in another loop i think to sort out so that every time it sees another nextcursormark value it recreates a new element tree that is then parsed by the following for loop? also concerned that my code isn't done such that this will be simple? so if there's something there that doesn't make sense i'd appreciate ideas there too?
'''
import math
import pandas as pd
import streamlit as st
import numpy as np
import json
import xml.etree.ElementTree as ET
import urllib.request
import rdflib
import altair as alt
from urllib.request import urlopen
from xml.etree.ElementTree import parse
"""
# Europe PMC Open Data Dashboard
"""
builtQuery=('https://www.ebi.ac.uk/europepmc/webservices/rest/search?query=virginia&resultType=core&cursorMark=*&pageSize=50&format=xml')
#https://www.foxinfotech.in/2019/04/python-how-to-read-xml-from-url.html
restQuery=urlopen(builtQuery)
xmlTree=ET.parse(restQuery)
root = xmlTree.getroot()
#https://towardsdatascience.com/converting-multi-layered-xml-files-to-dataframes-in-python-using-xmltree-a13f9b043b48
openAccess=[]
authors=[]
date=[]
title=[]
iso=[]
doi=[]
nextPage=root[2].text
if int(root[1].text)<1000:
for a in root[4]:
root1=ET.Element('result')
root1=a
for b in root1.iter('isOpenAccess'):
root2=ET.Element('root')
for c in root1.iter('authorString'):
root3=ET.Element('root2')
for d in root1.iter('firstPublicationDate'):
root4=ET.Element('root3')
for e in root1.iter('title'):
root5=ET.Element('root4')
for f in root1.iter('ISOAbbreviation'):
root6=ET.Element('root5')
for g in root1.iter('doi'):
root7=ET.Element('root6')
openAccess.append(b.text)
authors.append(c.text)
date.append(d.text)
title.append(e.text)
iso.append(f.text)
doi.append(g.text)
df = pd.DataFrame({'Authors':authors,'ArticleTitle':title,'JournalTitle':iso,'date':date,'DOI':doi,'openAccess': openAccess})
df['date'] = pd.to_datetime(df['date'])
openFilter = sorted(df['openAccess'].drop_duplicates()) # select the open access values
open_Filter = st.sidebar.selectbox('Open Access?', openFilter) # render the streamlit widget on the sidebar of the page using the list we created above for the menu
df2=df[df['openAccess'].str.contains(open_Filter)] # create a dataframe filtered below
st.write(df2.sort_values(by='date'))
df['year']=df['date'].dt.to_period('Y')
df['yearDate'] = df['year'].astype(str)
df3 = df[['yearDate','openAccess']].copy()
valLayer = alt.Chart(df3).mark_bar().encode(x='yearDate',y='count(openAccess)',color='openAccess')
st.altair_chart(valLayer, use_container_width=True)
'''
btw- the i've fixed the URL and the output of that is
'''
<responseWrapper xmlns:slx="http://www.scholix.org" xmlns:epmc="https://www.europepmc.org/data" nighteye="disabled">
<script id="tinyhippos-injected"/>
<version>6.5</version>
<hitCount>277624</hitCount>
<nextCursorMark>AoIIQJRo5Sg0MzQwNzg5MQ==</nextCursorMark>
<request>
<queryString>virginia</queryString>
<resultType>core</resultType>
<cursorMark>*</cursorMark>
<pageSize>50</pageSize>
<sort/>
<synonym>false</synonym>
</request>
<resultList>
<result>