pubmed_client/pmc/
client.rs

1use std::time::Duration;
2
3use crate::cache::{PmcCache, create_cache};
4use crate::common::{PmcId, PubMedId};
5use crate::config::ClientConfig;
6use crate::error::{ParseError, PubMedError, Result};
7use crate::pmc::extracted::ExtractedFigure;
8use crate::pmc::oa_api;
9use crate::pmc::oa_api::OaSubsetInfo;
10use crate::pmc::parser::parse_pmc_xml;
11use crate::rate_limit::RateLimiter;
12use crate::retry::with_retry;
13use pubmed_parser::pmc::PmcArticle;
14use reqwest::{Client, Response};
15use tracing::{debug, info};
16
17#[cfg(not(target_arch = "wasm32"))]
18use {crate::pmc::tar::PmcTarClient, std::path::Path};
19
20/// Client for interacting with PMC (PubMed Central) API
21#[derive(Clone)]
22pub struct PmcClient {
23    client: Client,
24    base_url: String,
25    rate_limiter: RateLimiter,
26    config: ClientConfig,
27    #[cfg(not(target_arch = "wasm32"))]
28    tar_client: PmcTarClient,
29    cache: Option<PmcCache>,
30}
31
32impl PmcClient {
33    /// Create a new PMC client with default configuration
34    ///
35    /// Uses default NCBI rate limiting (3 requests/second) and no API key.
36    /// For production use, consider using `with_config()` to set an API key.
37    ///
38    /// # Example
39    ///
40    /// ```
41    /// use pubmed_client::PmcClient;
42    ///
43    /// let client = PmcClient::new();
44    /// ```
45    pub fn new() -> Self {
46        let config = ClientConfig::new();
47        Self::with_config(config)
48    }
49
50    pub fn get_pmc_config(&self) -> &ClientConfig {
51        &self.config
52    }
53
54    #[cfg(not(target_arch = "wasm32"))]
55    pub fn get_tar_client_config(&self) -> &ClientConfig {
56        &self.tar_client.config
57    }
58
59    /// Create a new PMC client with custom configuration
60    ///
61    /// # Arguments
62    ///
63    /// * `config` - Client configuration including rate limits, API key, etc.
64    ///
65    /// # Example
66    ///
67    /// ```
68    /// use pubmed_client::{PmcClient, ClientConfig};
69    ///
70    /// let config = ClientConfig::new()
71    ///     .with_api_key("your_api_key_here")
72    ///     .with_email("researcher@university.edu");
73    ///
74    /// let client = PmcClient::with_config(config);
75    /// ```
76    pub fn with_config(config: ClientConfig) -> Self {
77        let rate_limiter = config.create_rate_limiter();
78        let base_url = config.effective_base_url().to_string();
79
80        let client = {
81            #[cfg(not(target_arch = "wasm32"))]
82            {
83                Client::builder()
84                    .user_agent(config.effective_user_agent())
85                    .timeout(Duration::from_secs(config.timeout.as_secs()))
86                    .build()
87                    .expect("Failed to create HTTP client")
88            }
89
90            #[cfg(target_arch = "wasm32")]
91            {
92                Client::builder()
93                    .user_agent(config.effective_user_agent())
94                    .build()
95                    .expect("Failed to create HTTP client")
96            }
97        };
98
99        let cache = config.cache_config.as_ref().map(create_cache);
100
101        Self {
102            client,
103            base_url,
104            rate_limiter,
105            #[cfg(not(target_arch = "wasm32"))]
106            tar_client: PmcTarClient::new(config.clone()),
107            cache,
108            config,
109        }
110    }
111
112    /// Create a new PMC client with custom HTTP client and default configuration
113    ///
114    /// # Arguments
115    ///
116    /// * `client` - Custom reqwest client with specific configuration
117    ///
118    /// # Example
119    ///
120    /// ```
121    /// use pubmed_client::PmcClient;
122    /// use reqwest::Client;
123    /// use std::time::Duration;
124    ///
125    /// let http_client = Client::builder()
126    ///     .timeout(Duration::from_secs(30))
127    ///     .build()
128    ///     .unwrap();
129    ///
130    /// let client = PmcClient::with_client(http_client);
131    /// ```
132    pub fn with_client(client: Client) -> Self {
133        let config = ClientConfig::new();
134        let rate_limiter = config.create_rate_limiter();
135        let base_url = config.effective_base_url().to_string();
136
137        Self {
138            client,
139            base_url,
140            rate_limiter,
141            #[cfg(not(target_arch = "wasm32"))]
142            tar_client: PmcTarClient::new(config.clone()),
143            cache: None,
144            config,
145        }
146    }
147
148    /// Set a custom base URL for the PMC API
149    ///
150    /// # Arguments
151    ///
152    /// * `base_url` - The base URL for the PMC API
153    pub fn with_base_url(mut self, base_url: String) -> Self {
154        self.base_url = base_url;
155        self
156    }
157
158    /// Fetch full text from PMC using PMCID
159    ///
160    /// # Arguments
161    ///
162    /// * `pmcid` - PMC ID (with or without "PMC" prefix)
163    ///
164    /// # Returns
165    ///
166    /// Returns a `Result<PmcArticle>` containing the structured full text
167    ///
168    /// # Errors
169    ///
170    /// * `ParseError::PmcNotAvailable` - If PMC full text is not available
171    /// * `PubMedError::RequestError` - If the HTTP request fails
172    /// * `ParseError::XmlError` - If XML parsing fails
173    ///
174    /// # Example
175    ///
176    /// ```no_run
177    /// use pubmed_client::PmcClient;
178    ///
179    /// #[tokio::main]
180    /// async fn main() -> Result<(), Box<dyn std::error::Error>> {
181    ///     let client = PmcClient::new();
182    ///     let full_text = client.fetch_full_text("PMC7906746").await?;
183    ///     println!("Title: {}", full_text.title);
184    ///     println!("Sections: {}", full_text.sections.len());
185    ///     Ok(())
186    /// }
187    /// ```
188    pub async fn fetch_full_text(&self, pmcid: &str) -> Result<PmcArticle> {
189        let normalized_pmcid = self.normalize_pmcid(pmcid);
190        let cache_key = format!("pmc:{}", normalized_pmcid);
191
192        // Check cache first if available
193        if let Some(cache) = &self.cache
194            && let Some(cached) = cache.get(&cache_key).await
195        {
196            info!(pmcid = %normalized_pmcid, "Cache hit for PMC full text");
197            return Ok(cached);
198        }
199
200        // Fetch from API if not cached
201        let xml_content = self.fetch_xml(pmcid).await?;
202        let full_text = parse_pmc_xml(&xml_content, &normalized_pmcid)?;
203
204        // Store in cache if available
205        if let Some(cache) = &self.cache {
206            cache.insert(cache_key, full_text.clone()).await;
207        }
208
209        Ok(full_text)
210    }
211
212    /// Fetch raw XML content from PMC
213    ///
214    /// # Arguments
215    ///
216    /// * `pmcid` - PMC ID (with or without "PMC" prefix)
217    ///
218    /// # Returns
219    ///
220    /// Returns a `Result<String>` containing the raw XML content
221    pub async fn fetch_xml(&self, pmcid: &str) -> Result<String> {
222        // Validate and parse PMC ID
223        let pmc_id = PmcId::parse(pmcid)?;
224        let normalized_pmcid = pmc_id.as_str();
225        let numeric_part = pmc_id.numeric_part();
226
227        // Build URL with API parameters
228        let mut url = format!(
229            "{}/efetch.fcgi?db=pmc&id=PMC{numeric_part}&retmode=xml",
230            self.base_url
231        );
232
233        // Add API parameters (API key, email, tool)
234        let api_params = self.config.build_api_params();
235        for (key, value) in api_params {
236            url.push('&');
237            url.push_str(&key);
238            url.push('=');
239            url.push_str(&urlencoding::encode(&value));
240        }
241
242        let response = self.make_request(&url).await?;
243
244        if !response.status().is_success() {
245            return Err(PubMedError::ApiError {
246                status: response.status().as_u16(),
247                message: response
248                    .status()
249                    .canonical_reason()
250                    .unwrap_or("Unknown error")
251                    .to_string(),
252            });
253        }
254
255        let xml_content = response.text().await?;
256
257        // Check if the response contains an error
258        if xml_content.contains("<ERROR>") {
259            return Err(ParseError::PmcNotAvailable {
260                id: normalized_pmcid,
261            }
262            .into());
263        }
264
265        Ok(xml_content)
266    }
267
268    /// Check if PMC full text is available for a given PMID
269    ///
270    /// # Arguments
271    ///
272    /// * `pmid` - PubMed ID
273    ///
274    /// # Returns
275    ///
276    /// Returns `Result<Option<String>>` containing the PMCID if available
277    ///
278    /// # Example
279    ///
280    /// ```no_run
281    /// use pubmed_client::PmcClient;
282    ///
283    /// #[tokio::main]
284    /// async fn main() -> Result<(), Box<dyn std::error::Error>> {
285    ///     let client = PmcClient::new();
286    ///     if let Some(pmcid) = client.check_pmc_availability("33515491").await? {
287    ///         println!("PMC available: {}", pmcid);
288    ///         let full_text = client.fetch_full_text(&pmcid).await?;
289    ///         println!("Title: {}", full_text.title);
290    ///     } else {
291    ///         println!("PMC not available");
292    ///     }
293    ///     Ok(())
294    /// }
295    /// ```
296    pub async fn check_pmc_availability(&self, pmid: &str) -> Result<Option<String>> {
297        // Validate and parse PMID
298        let pmid_obj = PubMedId::parse(pmid)?;
299        let pmid_value = pmid_obj.as_u32();
300
301        // Build URL with API parameters
302        let mut url = format!(
303            "{}/elink.fcgi?dbfrom=pubmed&db=pmc&id={pmid_value}&retmode=json",
304            self.base_url
305        );
306
307        // Add API parameters (API key, email, tool)
308        let api_params = self.config.build_api_params();
309        for (key, value) in api_params {
310            url.push('&');
311            url.push_str(&key);
312            url.push('=');
313            url.push_str(&urlencoding::encode(&value));
314        }
315
316        let response = self.make_request(&url).await?;
317
318        if !response.status().is_success() {
319            return Err(PubMedError::ApiError {
320                status: response.status().as_u16(),
321                message: response
322                    .status()
323                    .canonical_reason()
324                    .unwrap_or("Unknown error")
325                    .to_string(),
326            });
327        }
328
329        let link_result: serde_json::Value = response.json().await?;
330
331        // Extract PMCID from response
332        if let Some(linksets) = link_result["linksets"].as_array() {
333            for linkset in linksets {
334                if let Some(linksetdbs) = linkset["linksetdbs"].as_array() {
335                    for linksetdb in linksetdbs {
336                        if linksetdb["dbto"] == "pmc"
337                            && let Some(links) = linksetdb["links"].as_array()
338                            && let Some(pmcid) = links.first()
339                        {
340                            return Ok(Some(format!("PMC{pmcid}")));
341                        }
342                    }
343                }
344            }
345        }
346        Ok(None)
347    }
348
349    /// Check if a PMC article is in the OA (Open Access) subset
350    ///
351    /// The OA subset contains articles with programmatic access to full-text XML.
352    /// Some publishers restrict programmatic access even though the article may be
353    /// viewable on the PMC website.
354    ///
355    /// # Arguments
356    ///
357    /// * `pmcid` - PMC ID (with or without "PMC" prefix)
358    ///
359    /// # Returns
360    ///
361    /// Returns `Result<OaSubsetInfo>` containing detailed information about OA availability
362    ///
363    /// # Example
364    ///
365    /// ```no_run
366    /// use pubmed_client::PmcClient;
367    ///
368    /// #[tokio::main]
369    /// async fn main() -> Result<(), Box<dyn std::error::Error>> {
370    ///     let client = PmcClient::new();
371    ///     let oa_info = client.is_oa_subset("PMC7906746").await?;
372    ///
373    ///     if oa_info.is_oa_subset {
374    ///         println!("Article is in OA subset");
375    ///         if let Some(link) = oa_info.download_link {
376    ///             println!("Download: {}", link);
377    ///         }
378    ///     } else {
379    ///         println!("Article is NOT in OA subset");
380    ///         if let Some(code) = oa_info.error_code {
381    ///             println!("Reason: {}", code);
382    ///         }
383    ///     }
384    ///     Ok(())
385    /// }
386    /// ```
387    pub async fn is_oa_subset(&self, pmcid: &str) -> Result<OaSubsetInfo> {
388        let url = oa_api::build_oa_api_url(pmcid)?;
389
390        let response = self.make_request(&url).await?;
391
392        if !response.status().is_success() {
393            return Err(PubMedError::ApiError {
394                status: response.status().as_u16(),
395                message: response
396                    .status()
397                    .canonical_reason()
398                    .unwrap_or("Unknown error")
399                    .to_string(),
400            });
401        }
402
403        let xml_content = response.text().await?;
404
405        // Parse the OA API XML response
406        Ok(oa_api::parse_oa_response(&xml_content, pmcid)?)
407    }
408
409    /// Download and extract tar.gz file for a PMC article using the OA API
410    ///
411    /// # Arguments
412    ///
413    /// * `pmcid` - PMC ID (with or without "PMC" prefix)
414    /// * `output_dir` - Directory to extract the tar.gz contents to
415    ///
416    /// # Returns
417    ///
418    /// Returns a `Result<Vec<String>>` containing the list of extracted file paths
419    ///
420    /// # Errors
421    ///
422    /// * `ParseError::InvalidPmid` - If the PMCID format is invalid
423    /// * `PubMedError::RequestError` - If the HTTP request fails
424    /// * `ParseError::IoError` - If file operations fail
425    /// * `ParseError::PmcNotAvailable` - If the article is not available in OA
426    ///
427    /// # Example
428    ///
429    /// ```no_run
430    /// use pubmed_client::PmcClient;
431    /// use std::path::Path;
432    ///
433    /// #[tokio::main]
434    /// async fn main() -> Result<(), Box<dyn std::error::Error>> {
435    ///     let client = PmcClient::new();
436    ///     let output_dir = Path::new("./extracted_articles");
437    ///     let files = client.download_and_extract_tar("PMC7906746", output_dir).await?;
438    ///
439    ///     for file in files {
440    ///         println!("Extracted: {}", file);
441    ///     }
442    ///     Ok(())
443    /// }
444    /// ```
445    #[cfg(not(target_arch = "wasm32"))]
446    pub async fn download_and_extract_tar<P: AsRef<Path>>(
447        &self,
448        pmcid: &str,
449        output_dir: P,
450    ) -> Result<Vec<String>> {
451        self.tar_client
452            .download_and_extract_tar(pmcid, output_dir)
453            .await
454    }
455
456    /// Download, extract tar.gz file, and match figures with their captions from XML
457    ///
458    /// # Arguments
459    ///
460    /// * `pmcid` - PMC ID (with or without "PMC" prefix)
461    /// * `output_dir` - Directory to extract the tar.gz contents to
462    ///
463    /// # Returns
464    ///
465    /// Returns a `Result<Vec<ExtractedFigure>>` containing figures with both XML metadata and file paths
466    ///
467    /// # Errors
468    ///
469    /// * `ParseError::InvalidPmid` - If the PMCID format is invalid
470    /// * `PubMedError::RequestError` - If the HTTP request fails
471    /// * `ParseError::IoError` - If file operations fail
472    /// * `ParseError::PmcNotAvailable` - If the article is not available in OA
473    ///
474    /// # Example
475    ///
476    /// ```no_run
477    /// use pubmed_client::PmcClient;
478    /// use std::path::Path;
479    ///
480    /// #[tokio::main]
481    /// async fn main() -> Result<(), Box<dyn std::error::Error>> {
482    ///     let client = PmcClient::new();
483    ///     let output_dir = Path::new("./extracted_articles");
484    ///     let figures = client.extract_figures_with_captions("PMC7906746", output_dir).await?;
485    ///
486    ///     for figure in figures {
487    ///         println!("Figure {}: {}", figure.figure.id, figure.figure.caption);
488    ///         println!("File: {}", figure.extracted_file_path);
489    ///     }
490    ///     Ok(())
491    /// }
492    /// ```
493    #[cfg(not(target_arch = "wasm32"))]
494    pub async fn extract_figures_with_captions<P: AsRef<Path>>(
495        &self,
496        pmcid: &str,
497        output_dir: P,
498    ) -> Result<Vec<ExtractedFigure>> {
499        self.tar_client
500            .extract_figures_with_captions(pmcid, output_dir)
501            .await
502    }
503
504    /// Clear all cached PMC data
505    ///
506    /// # Example
507    ///
508    /// ```no_run
509    /// use pubmed_client::PmcClient;
510    ///
511    /// #[tokio::main]
512    /// async fn main() -> Result<(), Box<dyn std::error::Error>> {
513    ///     let client = PmcClient::new();
514    ///     client.clear_cache().await;
515    ///     Ok(())
516    /// }
517    /// ```
518    pub async fn clear_cache(&self) {
519        if let Some(cache) = &self.cache {
520            cache.clear().await;
521            info!("Cleared PMC cache");
522        }
523    }
524
525    /// Get cache statistics
526    ///
527    /// Returns the number of items in cache, or 0 if caching is disabled
528    ///
529    /// # Example
530    ///
531    /// ```
532    /// use pubmed_client::PmcClient;
533    ///
534    /// let client = PmcClient::new();
535    /// let count = client.cache_entry_count();
536    /// println!("Cache entries: {}", count);
537    /// ```
538    pub fn cache_entry_count(&self) -> u64 {
539        self.cache.as_ref().map_or(0, |cache| cache.entry_count())
540    }
541
542    /// Synchronize cache operations to ensure all pending operations are flushed
543    ///
544    /// This is useful for testing to ensure cache statistics are accurate
545    pub async fn sync_cache(&self) {
546        if let Some(cache) = &self.cache {
547            cache.sync().await;
548        }
549    }
550
551    /// Normalize PMCID format (ensure it starts with "PMC")
552    fn normalize_pmcid(&self, pmcid: &str) -> String {
553        // Use PmcId for validation and normalization
554        // If parsing fails, fall back to the old behavior for backwards compatibility
555        PmcId::parse(pmcid)
556            .map(|id| id.as_str())
557            .unwrap_or_else(|_| {
558                if pmcid.starts_with("PMC") {
559                    pmcid.to_string()
560                } else {
561                    format!("PMC{pmcid}")
562                }
563            })
564    }
565
566    /// Internal helper method for making HTTP requests with retry logic
567    async fn make_request(&self, url: &str) -> Result<Response> {
568        with_retry(
569            || async {
570                self.rate_limiter.acquire().await?;
571                debug!("Making API request to: {url}");
572                let response = self
573                    .client
574                    .get(url)
575                    .send()
576                    .await
577                    .map_err(PubMedError::from)?;
578
579                // Check if response has server error status and convert to retryable error
580                if response.status().is_server_error() || response.status().as_u16() == 429 {
581                    return Err(PubMedError::ApiError {
582                        status: response.status().as_u16(),
583                        message: response
584                            .status()
585                            .canonical_reason()
586                            .unwrap_or("Unknown error")
587                            .to_string(),
588                    });
589                }
590
591                Ok(response)
592            },
593            &self.config.retry_config,
594            "NCBI API request",
595        )
596        .await
597    }
598}
599
600impl Default for PmcClient {
601    fn default() -> Self {
602        Self::new()
603    }
604}
605
#[cfg(test)]
mod tests {
    use super::*;

    /// Both the bare numeric form and the prefixed form normalize to "PMC…".
    #[test]
    fn test_normalize_pmcid() {
        let client = PmcClient::new();

        for input in ["1234567", "PMC1234567"] {
            assert_eq!(client.normalize_pmcid(input), "PMC1234567");
        }
    }

    /// A default client points at the NCBI E-utilities host.
    #[test]
    fn test_client_creation() {
        let base_url = PmcClient::new().base_url;
        assert!(base_url.contains("eutils.ncbi.nlm.nih.gov"));
    }

    /// `with_base_url` overrides the base URL verbatim.
    #[test]
    fn test_custom_base_url() {
        let custom = "https://custom.api.example.com".to_string();
        let client = PmcClient::new().with_base_url(custom);
        assert_eq!(client.base_url, "https://custom.api.example.com");
    }
}
629}