pubmed_parser/pmc/
oa_api.rs

1//! PMC Open Access API client
2//!
3//! This module provides functionality to check if a PMC article is in the OA subset
4//! using the NCBI OA Web Service API.
5//!
6//! The OA subset contains articles with programmatic access to full-text XML.
7//! Not all PMC articles are in the OA subset - some publishers restrict programmatic access
8//! even though the article may be viewable on the PMC website.
9
10use crate::common::PmcId;
11use crate::error::{ParseError, Result};
12use quick_xml::de::from_str;
13use serde::{Deserialize, Serialize};
14use tracing::debug;
15
16// ================================================================================================
17// OA API Response Structs (for quick-xml deserialization)
18// ================================================================================================
19
/// Root element of OA API response
///
/// Deserialized from the `<OA>` document. Both children are optional: the
/// responses handled by this module carry either an `<error>` or a `<records>`
/// element, and the parser also copes with neither being present. Other
/// children of `<OA>` (e.g. `<responseDate>`, `<request>`) are not mapped.
#[derive(Debug, Deserialize)]
#[serde(rename = "OA")]
struct OaResponse {
    /// `<error>` child, if the response has one
    #[serde(rename = "error")]
    error: Option<OaError>,
    /// `<records>` child, if the response has one
    #[serde(rename = "records")]
    records: Option<OaRecords>,
}
29
/// Error element in OA API response
///
/// Example:
/// `<error code="idIsNotOpenAccess">identifier 'PMC8550608' is not Open Access</error>`
#[derive(Debug, Deserialize)]
struct OaError {
    /// `code` attribute (the `@` prefix maps the field to an XML attribute)
    #[serde(rename = "@code")]
    code: Option<String>,
    /// Text content of the element (`$text` maps the field to the element text).
    // NOTE(review): this field is not optional, so an `<error>` without any text
    // presumably fails to deserialize - confirm against quick-xml behaviour.
    #[serde(rename = "$text")]
    message: String,
}
38
/// Records container in OA API response
///
/// Deserialized from the `<records>` element. Its attributes (such as
/// `returned-count` / `total-count`) are not mapped.
#[derive(Debug, Deserialize)]
struct OaRecords {
    /// All `<record>` children in document order; `default` yields an empty
    /// vector when the container has no `<record>` child.
    #[serde(rename = "record", default)]
    record: Vec<OaRecord>,
}
45
46/// Individual record in OA API response
47#[derive(Debug, Deserialize)]
48struct OaRecord {
49    #[serde(rename = "@id")]
50    _id: Option<String>,
51    #[serde(rename = "@citation")]
52    citation: Option<String>,
53    #[serde(rename = "@license")]
54    license: Option<String>,
55    #[serde(rename = "@retracted")]
56    retracted: Option<String>,
57    #[serde(rename = "link")]
58    link: Option<OaLink>,
59}
60
/// Link element in OA record
///
/// Example:
/// `<link format="tgz" updated="2022-12-16 07:10:15" href="ftp://..." />`
///
/// All three attributes are optional, so a link lacking any of them still parses.
#[derive(Debug, Deserialize)]
struct OaLink {
    /// `format` attribute (e.g. "tgz")
    #[serde(rename = "@format")]
    format: Option<String>,
    /// `updated` attribute, kept as the raw timestamp string
    #[serde(rename = "@updated")]
    updated: Option<String>,
    /// `href` attribute: the download location
    #[serde(rename = "@href")]
    href: Option<String>,
}
71
72// ================================================================================================
73// Public API
74// ================================================================================================
75
/// OA API base URL
///
/// Endpoint of the NCBI OA Web Service. `build_oa_api_url` appends the `id`
/// query parameter to this value.
const OA_API_BASE_URL: &str = "https://www.ncbi.nlm.nih.gov/pmc/utils/oa/oa.fcgi";
78
79/// Build the OA API URL for a given PMC ID
80pub fn build_oa_api_url(pmcid: &str) -> Result<String> {
81    let pmc_id = PmcId::parse(pmcid)?;
82    Ok(format!("{}?id={}", OA_API_BASE_URL, pmc_id.as_str()))
83}
84
85/// Parse OA API XML response
86///
87/// # Arguments
88///
89/// * `xml` - Raw XML response from OA API
90/// * `pmcid` - PMC ID for error reporting
91///
92/// # Returns
93///
94/// Returns `Result<OaSubsetInfo>` containing detailed information about OA availability
95pub fn parse_oa_response(xml: &str, pmcid: &str) -> Result<OaSubsetInfo> {
96    let oa_response: OaResponse = from_str(xml).map_err(|e| {
97        debug!(pmcid = %pmcid, error = %e, "Failed to parse OA API response");
98        ParseError::XmlError(format!("Failed to parse OA API response: {e}"))
99    })?;
100
101    // Check for error response
102    if let Some(error) = oa_response.error {
103        return Ok(OaSubsetInfo::not_available(
104            pmcid.to_string(),
105            error.code.unwrap_or_else(|| "unknown".to_string()),
106            error.message,
107        ));
108    }
109
110    // Check for records
111    if let Some(records) = oa_response.records
112        && let Some(record) = records.record.into_iter().next()
113    {
114        let mut info = OaSubsetInfo::available(pmcid.to_string());
115
116        info.citation = record.citation;
117        info.license = record.license;
118        info.retracted = record.retracted.is_some_and(|r| r == "yes");
119
120        if let Some(link) = record.link {
121            info.download_format = link.format;
122            info.updated = link.updated;
123            info.download_link = link.href;
124        }
125
126        return Ok(info);
127    }
128
129    // No error and no records - unexpected format
130    debug!(pmcid = %pmcid, "OA API response has no error and no records");
131    Ok(OaSubsetInfo::not_available(
132        pmcid.to_string(),
133        "parseError".to_string(),
134        "OA API response has no error and no records".to_string(),
135    ))
136}
137
138// ================================================================================================
139// OA Subset Info (public type)
140// ================================================================================================
141
/// Information about OA (Open Access) subset availability for a PMC article
///
/// The OA subset contains articles with programmatic access to full-text XML.
/// Not all PMC articles are in the OA subset - some publishers restrict programmatic access
/// even though the article may be viewable on the PMC website.
///
/// Values built by `OaSubsetInfo::available` have `is_oa_subset == true` and no
/// error fields; values built by `OaSubsetInfo::not_available` have
/// `is_oa_subset == false` and both error fields set.
#[derive(Debug, Serialize, Deserialize, Clone)]
pub struct OaSubsetInfo {
    /// PMC ID (e.g., "PMC7906746")
    pub pmcid: String,
    /// Whether the article is in the OA subset
    pub is_oa_subset: bool,
    /// Citation string (if available)
    pub citation: Option<String>,
    /// License type (if available)
    pub license: Option<String>,
    /// Whether the article is retracted; `parse_oa_response` sets this only
    /// when the record's `retracted` attribute is exactly "yes"
    pub retracted: bool,
    /// Download link (if available); `parse_oa_response` takes it from the
    /// `href` of the record's `<link>`, whose format is given by `download_format`
    pub download_link: Option<String>,
    /// Format of the download (e.g., "tgz", "pdf")
    pub download_format: Option<String>,
    /// Last updated timestamp for the download, kept as the raw string from the API
    pub updated: Option<String>,
    /// Error code if not in OA subset
    pub error_code: Option<String>,
    /// Error message if not in OA subset
    pub error_message: Option<String>,
}
170
171impl OaSubsetInfo {
172    /// Create a new OaSubsetInfo for an article in the OA subset
173    pub fn available(pmcid: String) -> Self {
174        Self {
175            pmcid,
176            is_oa_subset: true,
177            citation: None,
178            license: None,
179            retracted: false,
180            download_link: None,
181            download_format: None,
182            updated: None,
183            error_code: None,
184            error_message: None,
185        }
186    }
187
188    /// Create a new OaSubsetInfo for an article NOT in the OA subset
189    pub fn not_available(pmcid: String, error_code: String, error_message: String) -> Self {
190        Self {
191            pmcid,
192            is_oa_subset: false,
193            citation: None,
194            license: None,
195            retracted: false,
196            download_link: None,
197            download_format: None,
198            updated: None,
199            error_code: Some(error_code),
200            error_message: Some(error_message),
201        }
202    }
203}
204
#[cfg(test)]
mod tests {
    use super::*;

    /// URL expected for the example article, whichever spelling of the ID is given.
    const EXPECTED_URL: &str =
        "https://www.ncbi.nlm.nih.gov/pmc/utils/oa/oa.fcgi?id=PMC7906746";

    /// Both the prefixed and the bare numeric ID map to the same request URL.
    #[test]
    fn test_build_oa_api_url() {
        for input in ["PMC7906746", "7906746"] {
            let url = build_oa_api_url(input).unwrap();
            assert_eq!(url, EXPECTED_URL);
        }
    }

    /// An `<error>` response is reported as "not in the OA subset" with the API's code and text.
    #[test]
    fn test_parse_oa_response_not_open_access() {
        let xml = r#"<OA><responseDate>2026-01-02 10:45:24</responseDate><request>https://www.ncbi.nlm.nih.gov/pmc/utils/oa/oa.fcgi?id=PMC8550608</request><error code="idIsNotOpenAccess">identifier 'PMC8550608' is not Open Access</error></OA>"#;

        let info = parse_oa_response(xml, "PMC8550608").unwrap();

        assert!(!info.is_oa_subset);
        assert_eq!(info.pmcid, "PMC8550608");
        assert_eq!(info.error_code.as_deref(), Some("idIsNotOpenAccess"));
        let message = info.error_message.as_deref().unwrap();
        assert!(message.contains("is not Open Access"));
        assert!(info.download_link.is_none());
    }

    /// A `<records>` response populates citation, license and link details.
    #[test]
    fn test_parse_oa_response_open_access() {
        let xml = r#"<OA><responseDate>2026-01-02 10:45:39</responseDate><request id="PMC7906746">https://www.ncbi.nlm.nih.gov/pmc/utils/oa/oa.fcgi?id=PMC7906746</request><records returned-count="1" total-count="1"><record id="PMC7906746" citation="Lancet. 2021 Jan 27 6-12 February; 397(10273):452-455" license="none" retracted="no"><link format="tgz" updated="2022-12-16 07:10:15" href="ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_package/f1/69/PMC7906746.tar.gz" /></record></records></OA>"#;

        let info = parse_oa_response(xml, "PMC7906746").unwrap();

        assert!(info.is_oa_subset);
        assert_eq!(info.pmcid, "PMC7906746");
        assert_eq!(
            info.citation.as_deref(),
            Some("Lancet. 2021 Jan 27 6-12 February; 397(10273):452-455")
        );
        assert_eq!(info.license.as_deref(), Some("none"));
        assert!(!info.retracted);
        assert_eq!(info.download_format.as_deref(), Some("tgz"));
        assert_eq!(info.updated.as_deref(), Some("2022-12-16 07:10:15"));
        assert_eq!(
            info.download_link.as_deref(),
            Some("ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_package/f1/69/PMC7906746.tar.gz")
        );
        assert!(info.error_code.is_none());
    }

    /// `retracted="yes"` on the record sets the retraction flag.
    #[test]
    fn test_parse_oa_response_retracted() {
        let xml = r#"<OA><records><record id="PMC1234567" citation="Test" license="cc-by" retracted="yes"><link format="tgz" href="ftp://test.com/file.tar.gz" /></record></records></OA>"#;

        let info = parse_oa_response(xml, "PMC1234567").unwrap();

        assert!(info.is_oa_subset);
        assert!(info.retracted);
        assert_eq!(info.license.as_deref(), Some("cc-by"));
    }
}