#!/bin/bash
#
# Fetch a listing of articles from the PMC OA web service, download the PDF
# linked from each record into the current directory, and append the record's
# citation and PDF link to metadata.txt.
#
# The parsing below expects the response to contain elements of the form
#   <record id="PMC..." citation="..." ...> <link format="pdf" ... href="..."/> </record>
# NOTE(review): that shape is taken from the patterns this script matches;
# confirm against the service's current output.

# Specify the number of articles to download
limit=10

#######################################
# Print the first capture group of a regex match.
# Arguments: $1 - text to search
#            $2 - POSIX extended regex containing one capture group
# Outputs:   the captured text on stdout
# Returns:   0 on a match, 1 otherwise (nothing is printed)
#######################################
first_match() {
  [[ $1 =~ $2 ]] || return 1
  printf '%s\n' "${BASH_REMATCH[1]}"
}

# Print the PMC ID found in the record text $1.
record_id() {
  first_match "$1" 'id="(PMC[0-9]+)"'
}

# Print the citation attribute of the record text $1. The value stops at the
# attribute's closing quote, so attributes that follow it are not included.
record_title() {
  first_match "$1" 'citation="([^"]*)"'
}

# Print the href of the first <link format="pdf" ...> in the record text $1.
record_pdf_link() {
  first_match "$1" '<link format="pdf"[^>]* href="([^"]*)"'
}

#######################################
# Fetch the listing and process every record in it.
# Globals:   limit (read)
# Outputs:   progress on stdout, errors on stderr; PDFs and metadata.txt are
#            written to the current directory
# Returns:   0 if the listing was fetched and every attempted download
#            succeeded, 1 otherwise
#######################################
main() {
  local response rest record pmc_id title pdf_link
  local status=0

  # Fetch the list of articles with metadata in XML format.
  # -sS keeps the progress meter off but still reports errors; --fail turns
  # HTTP error responses into a non-zero exit status.
  if ! response=$(curl -sS --fail --connect-timeout 15 \
    "https://www.ncbi.nlm.nih.gov/pmc/utils/oa/oa.fcgi?format=pdf&limit=${limit}"); then
    printf 'Failed to fetch the article list\n' >&2
    return 1
  fi

  # Walk the response one <record ...>...</record> span at a time. Splitting
  # on the tags rather than on newlines means the layout of the XML (one
  # record per line, or everything on a single line) does not matter.
  rest=$response
  while [[ $rest == *'<record '* ]]; do
    rest=${rest#*'<record '}
    record=${rest%%'</record>'*}
    # If a closing tag is missing, never read into the following record.
    record=${record%%'<record '*}

    # Extract the PMC ID; skip records without one
    pmc_id=$(record_id "$record") || continue
    printf '%s\n' "Processing article ID: $pmc_id"

    # Extract the title for metadata (left empty when there is no citation)
    title=$(record_title "$record")

    # Extract the PDF link for download
    if ! pdf_link=$(record_pdf_link "$record"); then
      printf '%s\n' "No PDF link found for article ID: $pmc_id"
      continue
    fi

    # Print metadata
    printf '%s\n' "Title: $title"
    printf '%s\n' "Downloading PDF from: $pdf_link"

    # Download the PDF; only record metadata for files that were fetched
    if curl --fail --connect-timeout 15 -O "$pdf_link"; then
      {
        printf '%s\n' "Title: $title"
        printf '%s\n' "PDF Link: $pdf_link"
        printf '%s\n' "---------------------"
      } >> metadata.txt
    else
      printf '%s\n' "Download failed for article ID: $pmc_id" >&2
      status=1
    fi
  done

  return "$status"
}

# As the last command, main's return value becomes the script's exit status.
main "$@"