import json
import xml.etree.ElementTree as ET
from bs4 import BeautifulSoup
import requests
import threading
import utils
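# Pipeline overview (derived from the functions below):
#   fetch_all_links()   -> collect article links from phys.org RSS feeds, per category
#   fetch_dois()        -> scrape each article page for its DOI
#   fetch_doi_data()    -> run the DOI scrape on a worker thread
#   doi_to_pmc()        -> map DOIs to PMC IDs via the NCBI ID converter
#   extract_phys_data() -> keep up to two unseen PMC IDs per topic, tracked in phys.txt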
def fetch_links(link):
    """Return the list of article links from a phys.org RSS feed."""
    links = []
    xml_data = utils.fetch_page(link)
    items = ET.fromstring(xml_data).findall('channel/item')
    for item in items:
        item_link = item.find('link').text  # avoid shadowing the feed URL parameter
        links.append(item_link)
    return links

def fetch_all_links():
    """Collect article links for every tracked phys.org category."""
    category_link_data = {
        "Earth": "https://phys.org/rss-feed/breaking/earth-news/",
        "Science": "https://phys.org/rss-feed/breaking/science-news/",
        "Nano-technology": "https://phys.org/rss-feed/breaking/nanotech-news/",
        "Physics": "https://phys.org/rss-feed/breaking/physics-news/",
        "Astronomy & Space": "https://phys.org/rss-feed/breaking/space-news/",
        "Biology": "https://phys.org/rss-feed/breaking/biology-news/",
        "Chemistry": "https://phys.org/rss-feed/breaking/chemistry-news/",
    }
    sd_links_data = {}
    for category, link in category_link_data.items():
        links = fetch_links(link)
        sd_links_data[category] = links
    return json.dumps(sd_links_data, indent=4, ensure_ascii=False)

def fetch_dois():
    """Scrape each article page for its DOI, grouped by topic."""
    doi_data = {}
    data = json.loads(fetch_all_links())
    for topic, links in data.items():
        doi_list = []
        for link in links:
            page_content = utils.fetch_page(link)
            page_datas = BeautifulSoup(page_content, 'html.parser').find_all("div", class_="article-main__more")
            for page_data in page_datas:
                doi_link = page_data.find("a", attrs={"data-doi": "1"})
                if doi_link:
                    doi = doi_link.text.split('DOI: ')[-1]
                    if doi.startswith('10.'):
                        doi_list.append(doi)
        doi_data[topic] = doi_list
    return json.dumps(doi_data, indent=4, ensure_ascii=False)

def fetch_doi_data():
    """Run fetch_dois() on a worker thread and return its JSON string."""
    result = []

    def fetch_and_store():
        result.append(fetch_dois())

    thread = threading.Thread(target=fetch_and_store)
    thread.start()
    thread.join()
    # Return a JSON string even on failure so callers can always json.loads() the result.
    return result[0] if result else "{}"

def doi_to_pmc():
    """Map scraped DOIs to PMC IDs via the NCBI ID converter service."""
    data = json.loads(fetch_doi_data())
    pmc_data = {}
    for topic, dois in data.items():
        if not dois:
            continue
        doi_list = ",".join(dois)
        try:
            # "[email protected]" is a redacted placeholder left by page obfuscation for the caller's email parameter.
            url = f"https://www.ncbi.nlm.nih.gov/pmc/utils/idconv/v1.0/[email protected]&ids={doi_list}&format=json"
            doi_pmc_data = requests.get(url).json()
            if doi_pmc_data['status'] == 'ok':
                pmc_list = [record['pmcid'] for record in doi_pmc_data['records'] if 'pmcid' in record and record.get('live', True)]
                pmc_data[topic] = pmc_list[:2]
        except Exception as e:
            print(f"Error: {str(e)}")
    return json.dumps(pmc_data, indent=4, ensure_ascii=False)

def extract_phys_data():
    """Keep up to two previously unseen PMC IDs per topic and record them in phys.txt."""
    if not utils.download_datafile('phys.txt'):
        raise Exception("Failed to download datafile")
    pmc_data = {}
    pmcid_data = json.loads(doi_to_pmc())
    for topic, pmcids in pmcid_data.items():
        pmc_ids = []
        for pmcid in pmcids:
            if len(pmc_ids) >= 2:
                break
            if not utils.check_data_in_file(pmcid, 'phys.txt'):
                utils.write_data_to_file(pmcid, 'phys.txt')
                pmc_ids.append(pmcid)
        pmc_data[topic] = {"ids": pmc_ids, "count": len(pmc_ids)}
    if not utils.upload_datafile('phys.txt'):
        raise Exception("Failed to upload datafile")
    return json.dumps(pmc_data, indent=4, ensure_ascii=False)

if __name__ == "__main__":
    data = extract_phys_data()
    with open('phys_data.json', 'w') as f:
        f.write(data)
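
# Illustrative shape of phys_data.json (the IDs below are made-up examples, not real output):
# {
#     "Physics": {"ids": ["PMC1234567", "PMC7654321"], "count": 2},
#     "Biology": {"ids": ["PMC2345678"], "count": 1}
# }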