pubmed_client/pmc/client.rs
1use std::time::Duration;
2
3use crate::cache::{PmcCache, create_cache};
4use crate::common::{PmcId, PubMedId};
5use crate::config::ClientConfig;
6use crate::error::{ParseError, PubMedError, Result};
7use crate::pmc::extracted::ExtractedFigure;
8use crate::pmc::oa_api;
9use crate::pmc::oa_api::OaSubsetInfo;
10use crate::pmc::parser::parse_pmc_xml;
11use crate::rate_limit::RateLimiter;
12use crate::retry::with_retry;
13use pubmed_parser::pmc::PmcArticle;
14use reqwest::{Client, Response};
15use tracing::{debug, info};
16
17#[cfg(not(target_arch = "wasm32"))]
18use {crate::pmc::tar::PmcTarClient, std::path::Path};
19
20/// Client for interacting with PMC (PubMed Central) API
21#[derive(Clone)]
22pub struct PmcClient {
23 client: Client,
24 base_url: String,
25 rate_limiter: RateLimiter,
26 config: ClientConfig,
27 #[cfg(not(target_arch = "wasm32"))]
28 tar_client: PmcTarClient,
29 cache: Option<PmcCache>,
30}
31
32impl PmcClient {
33 /// Create a new PMC client with default configuration
34 ///
35 /// Uses default NCBI rate limiting (3 requests/second) and no API key.
36 /// For production use, consider using `with_config()` to set an API key.
37 ///
38 /// # Example
39 ///
40 /// ```
41 /// use pubmed_client::PmcClient;
42 ///
43 /// let client = PmcClient::new();
44 /// ```
45 pub fn new() -> Self {
46 let config = ClientConfig::new();
47 Self::with_config(config)
48 }
49
50 pub fn get_pmc_config(&self) -> &ClientConfig {
51 &self.config
52 }
53
54 #[cfg(not(target_arch = "wasm32"))]
55 pub fn get_tar_client_config(&self) -> &ClientConfig {
56 &self.tar_client.config
57 }
58
59 /// Create a new PMC client with custom configuration
60 ///
61 /// # Arguments
62 ///
63 /// * `config` - Client configuration including rate limits, API key, etc.
64 ///
65 /// # Example
66 ///
67 /// ```
68 /// use pubmed_client::{PmcClient, ClientConfig};
69 ///
70 /// let config = ClientConfig::new()
71 /// .with_api_key("your_api_key_here")
72 /// .with_email("researcher@university.edu");
73 ///
74 /// let client = PmcClient::with_config(config);
75 /// ```
76 pub fn with_config(config: ClientConfig) -> Self {
77 let rate_limiter = config.create_rate_limiter();
78 let base_url = config.effective_base_url().to_string();
79
80 let client = {
81 #[cfg(not(target_arch = "wasm32"))]
82 {
83 Client::builder()
84 .user_agent(config.effective_user_agent())
85 .timeout(Duration::from_secs(config.timeout.as_secs()))
86 .build()
87 .expect("Failed to create HTTP client")
88 }
89
90 #[cfg(target_arch = "wasm32")]
91 {
92 Client::builder()
93 .user_agent(config.effective_user_agent())
94 .build()
95 .expect("Failed to create HTTP client")
96 }
97 };
98
99 let cache = config.cache_config.as_ref().map(create_cache);
100
101 Self {
102 client,
103 base_url,
104 rate_limiter,
105 #[cfg(not(target_arch = "wasm32"))]
106 tar_client: PmcTarClient::new(config.clone()),
107 cache,
108 config,
109 }
110 }
111
112 /// Create a new PMC client with custom HTTP client and default configuration
113 ///
114 /// # Arguments
115 ///
116 /// * `client` - Custom reqwest client with specific configuration
117 ///
118 /// # Example
119 ///
120 /// ```
121 /// use pubmed_client::PmcClient;
122 /// use reqwest::Client;
123 /// use std::time::Duration;
124 ///
125 /// let http_client = Client::builder()
126 /// .timeout(Duration::from_secs(30))
127 /// .build()
128 /// .unwrap();
129 ///
130 /// let client = PmcClient::with_client(http_client);
131 /// ```
132 pub fn with_client(client: Client) -> Self {
133 let config = ClientConfig::new();
134 let rate_limiter = config.create_rate_limiter();
135 let base_url = config.effective_base_url().to_string();
136
137 Self {
138 client,
139 base_url,
140 rate_limiter,
141 #[cfg(not(target_arch = "wasm32"))]
142 tar_client: PmcTarClient::new(config.clone()),
143 cache: None,
144 config,
145 }
146 }
147
148 /// Set a custom base URL for the PMC API
149 ///
150 /// # Arguments
151 ///
152 /// * `base_url` - The base URL for the PMC API
153 pub fn with_base_url(mut self, base_url: String) -> Self {
154 self.base_url = base_url;
155 self
156 }
157
158 /// Fetch full text from PMC using PMCID
159 ///
160 /// # Arguments
161 ///
162 /// * `pmcid` - PMC ID (with or without "PMC" prefix)
163 ///
164 /// # Returns
165 ///
166 /// Returns a `Result<PmcArticle>` containing the structured full text
167 ///
168 /// # Errors
169 ///
170 /// * `ParseError::PmcNotAvailable` - If PMC full text is not available
171 /// * `PubMedError::RequestError` - If the HTTP request fails
172 /// * `ParseError::XmlError` - If XML parsing fails
173 ///
174 /// # Example
175 ///
176 /// ```no_run
177 /// use pubmed_client::PmcClient;
178 ///
179 /// #[tokio::main]
180 /// async fn main() -> Result<(), Box<dyn std::error::Error>> {
181 /// let client = PmcClient::new();
182 /// let full_text = client.fetch_full_text("PMC7906746").await?;
183 /// println!("Title: {}", full_text.title);
184 /// println!("Sections: {}", full_text.sections.len());
185 /// Ok(())
186 /// }
187 /// ```
188 pub async fn fetch_full_text(&self, pmcid: &str) -> Result<PmcArticle> {
189 let normalized_pmcid = self.normalize_pmcid(pmcid);
190 let cache_key = format!("pmc:{}", normalized_pmcid);
191
192 // Check cache first if available
193 if let Some(cache) = &self.cache
194 && let Some(cached) = cache.get(&cache_key).await
195 {
196 info!(pmcid = %normalized_pmcid, "Cache hit for PMC full text");
197 return Ok(cached);
198 }
199
200 // Fetch from API if not cached
201 let xml_content = self.fetch_xml(pmcid).await?;
202 let full_text = parse_pmc_xml(&xml_content, &normalized_pmcid)?;
203
204 // Store in cache if available
205 if let Some(cache) = &self.cache {
206 cache.insert(cache_key, full_text.clone()).await;
207 }
208
209 Ok(full_text)
210 }
211
212 /// Fetch raw XML content from PMC
213 ///
214 /// # Arguments
215 ///
216 /// * `pmcid` - PMC ID (with or without "PMC" prefix)
217 ///
218 /// # Returns
219 ///
220 /// Returns a `Result<String>` containing the raw XML content
221 pub async fn fetch_xml(&self, pmcid: &str) -> Result<String> {
222 // Validate and parse PMC ID
223 let pmc_id = PmcId::parse(pmcid)?;
224 let normalized_pmcid = pmc_id.as_str();
225 let numeric_part = pmc_id.numeric_part();
226
227 // Build URL with API parameters
228 let mut url = format!(
229 "{}/efetch.fcgi?db=pmc&id=PMC{numeric_part}&retmode=xml",
230 self.base_url
231 );
232
233 // Add API parameters (API key, email, tool)
234 let api_params = self.config.build_api_params();
235 for (key, value) in api_params {
236 url.push('&');
237 url.push_str(&key);
238 url.push('=');
239 url.push_str(&urlencoding::encode(&value));
240 }
241
242 let response = self.make_request(&url).await?;
243
244 if !response.status().is_success() {
245 return Err(PubMedError::ApiError {
246 status: response.status().as_u16(),
247 message: response
248 .status()
249 .canonical_reason()
250 .unwrap_or("Unknown error")
251 .to_string(),
252 });
253 }
254
255 let xml_content = response.text().await?;
256
257 // Check if the response contains an error
258 if xml_content.contains("<ERROR>") {
259 return Err(ParseError::PmcNotAvailable {
260 id: normalized_pmcid,
261 }
262 .into());
263 }
264
265 Ok(xml_content)
266 }
267
268 /// Check if PMC full text is available for a given PMID
269 ///
270 /// # Arguments
271 ///
272 /// * `pmid` - PubMed ID
273 ///
274 /// # Returns
275 ///
276 /// Returns `Result<Option<String>>` containing the PMCID if available
277 ///
278 /// # Example
279 ///
280 /// ```no_run
281 /// use pubmed_client::PmcClient;
282 ///
283 /// #[tokio::main]
284 /// async fn main() -> Result<(), Box<dyn std::error::Error>> {
285 /// let client = PmcClient::new();
286 /// if let Some(pmcid) = client.check_pmc_availability("33515491").await? {
287 /// println!("PMC available: {}", pmcid);
288 /// let full_text = client.fetch_full_text(&pmcid).await?;
289 /// println!("Title: {}", full_text.title);
290 /// } else {
291 /// println!("PMC not available");
292 /// }
293 /// Ok(())
294 /// }
295 /// ```
296 pub async fn check_pmc_availability(&self, pmid: &str) -> Result<Option<String>> {
297 // Validate and parse PMID
298 let pmid_obj = PubMedId::parse(pmid)?;
299 let pmid_value = pmid_obj.as_u32();
300
301 // Build URL with API parameters
302 let mut url = format!(
303 "{}/elink.fcgi?dbfrom=pubmed&db=pmc&id={pmid_value}&retmode=json",
304 self.base_url
305 );
306
307 // Add API parameters (API key, email, tool)
308 let api_params = self.config.build_api_params();
309 for (key, value) in api_params {
310 url.push('&');
311 url.push_str(&key);
312 url.push('=');
313 url.push_str(&urlencoding::encode(&value));
314 }
315
316 let response = self.make_request(&url).await?;
317
318 if !response.status().is_success() {
319 return Err(PubMedError::ApiError {
320 status: response.status().as_u16(),
321 message: response
322 .status()
323 .canonical_reason()
324 .unwrap_or("Unknown error")
325 .to_string(),
326 });
327 }
328
329 let link_result: serde_json::Value = response.json().await?;
330
331 // Extract PMCID from response
332 if let Some(linksets) = link_result["linksets"].as_array() {
333 for linkset in linksets {
334 if let Some(linksetdbs) = linkset["linksetdbs"].as_array() {
335 for linksetdb in linksetdbs {
336 if linksetdb["dbto"] == "pmc"
337 && let Some(links) = linksetdb["links"].as_array()
338 && let Some(pmcid) = links.first()
339 {
340 return Ok(Some(format!("PMC{pmcid}")));
341 }
342 }
343 }
344 }
345 }
346 Ok(None)
347 }
348
349 /// Check if a PMC article is in the OA (Open Access) subset
350 ///
351 /// The OA subset contains articles with programmatic access to full-text XML.
352 /// Some publishers restrict programmatic access even though the article may be
353 /// viewable on the PMC website.
354 ///
355 /// # Arguments
356 ///
357 /// * `pmcid` - PMC ID (with or without "PMC" prefix)
358 ///
359 /// # Returns
360 ///
361 /// Returns `Result<OaSubsetInfo>` containing detailed information about OA availability
362 ///
363 /// # Example
364 ///
365 /// ```no_run
366 /// use pubmed_client::PmcClient;
367 ///
368 /// #[tokio::main]
369 /// async fn main() -> Result<(), Box<dyn std::error::Error>> {
370 /// let client = PmcClient::new();
371 /// let oa_info = client.is_oa_subset("PMC7906746").await?;
372 ///
373 /// if oa_info.is_oa_subset {
374 /// println!("Article is in OA subset");
375 /// if let Some(link) = oa_info.download_link {
376 /// println!("Download: {}", link);
377 /// }
378 /// } else {
379 /// println!("Article is NOT in OA subset");
380 /// if let Some(code) = oa_info.error_code {
381 /// println!("Reason: {}", code);
382 /// }
383 /// }
384 /// Ok(())
385 /// }
386 /// ```
387 pub async fn is_oa_subset(&self, pmcid: &str) -> Result<OaSubsetInfo> {
388 let url = oa_api::build_oa_api_url(pmcid)?;
389
390 let response = self.make_request(&url).await?;
391
392 if !response.status().is_success() {
393 return Err(PubMedError::ApiError {
394 status: response.status().as_u16(),
395 message: response
396 .status()
397 .canonical_reason()
398 .unwrap_or("Unknown error")
399 .to_string(),
400 });
401 }
402
403 let xml_content = response.text().await?;
404
405 // Parse the OA API XML response
406 Ok(oa_api::parse_oa_response(&xml_content, pmcid)?)
407 }
408
409 /// Download and extract tar.gz file for a PMC article using the OA API
410 ///
411 /// # Arguments
412 ///
413 /// * `pmcid` - PMC ID (with or without "PMC" prefix)
414 /// * `output_dir` - Directory to extract the tar.gz contents to
415 ///
416 /// # Returns
417 ///
418 /// Returns a `Result<Vec<String>>` containing the list of extracted file paths
419 ///
420 /// # Errors
421 ///
422 /// * `ParseError::InvalidPmid` - If the PMCID format is invalid
423 /// * `PubMedError::RequestError` - If the HTTP request fails
424 /// * `ParseError::IoError` - If file operations fail
425 /// * `ParseError::PmcNotAvailable` - If the article is not available in OA
426 ///
427 /// # Example
428 ///
429 /// ```no_run
430 /// use pubmed_client::PmcClient;
431 /// use std::path::Path;
432 ///
433 /// #[tokio::main]
434 /// async fn main() -> Result<(), Box<dyn std::error::Error>> {
435 /// let client = PmcClient::new();
436 /// let output_dir = Path::new("./extracted_articles");
437 /// let files = client.download_and_extract_tar("PMC7906746", output_dir).await?;
438 ///
439 /// for file in files {
440 /// println!("Extracted: {}", file);
441 /// }
442 /// Ok(())
443 /// }
444 /// ```
445 #[cfg(not(target_arch = "wasm32"))]
446 pub async fn download_and_extract_tar<P: AsRef<Path>>(
447 &self,
448 pmcid: &str,
449 output_dir: P,
450 ) -> Result<Vec<String>> {
451 self.tar_client
452 .download_and_extract_tar(pmcid, output_dir)
453 .await
454 }
455
456 /// Download, extract tar.gz file, and match figures with their captions from XML
457 ///
458 /// # Arguments
459 ///
460 /// * `pmcid` - PMC ID (with or without "PMC" prefix)
461 /// * `output_dir` - Directory to extract the tar.gz contents to
462 ///
463 /// # Returns
464 ///
465 /// Returns a `Result<Vec<ExtractedFigure>>` containing figures with both XML metadata and file paths
466 ///
467 /// # Errors
468 ///
469 /// * `ParseError::InvalidPmid` - If the PMCID format is invalid
470 /// * `PubMedError::RequestError` - If the HTTP request fails
471 /// * `ParseError::IoError` - If file operations fail
472 /// * `ParseError::PmcNotAvailable` - If the article is not available in OA
473 ///
474 /// # Example
475 ///
476 /// ```no_run
477 /// use pubmed_client::PmcClient;
478 /// use std::path::Path;
479 ///
480 /// #[tokio::main]
481 /// async fn main() -> Result<(), Box<dyn std::error::Error>> {
482 /// let client = PmcClient::new();
483 /// let output_dir = Path::new("./extracted_articles");
484 /// let figures = client.extract_figures_with_captions("PMC7906746", output_dir).await?;
485 ///
486 /// for figure in figures {
487 /// println!("Figure {}: {}", figure.figure.id, figure.figure.caption);
488 /// println!("File: {}", figure.extracted_file_path);
489 /// }
490 /// Ok(())
491 /// }
492 /// ```
493 #[cfg(not(target_arch = "wasm32"))]
494 pub async fn extract_figures_with_captions<P: AsRef<Path>>(
495 &self,
496 pmcid: &str,
497 output_dir: P,
498 ) -> Result<Vec<ExtractedFigure>> {
499 self.tar_client
500 .extract_figures_with_captions(pmcid, output_dir)
501 .await
502 }
503
504 /// Clear all cached PMC data
505 ///
506 /// # Example
507 ///
508 /// ```no_run
509 /// use pubmed_client::PmcClient;
510 ///
511 /// #[tokio::main]
512 /// async fn main() -> Result<(), Box<dyn std::error::Error>> {
513 /// let client = PmcClient::new();
514 /// client.clear_cache().await;
515 /// Ok(())
516 /// }
517 /// ```
518 pub async fn clear_cache(&self) {
519 if let Some(cache) = &self.cache {
520 cache.clear().await;
521 info!("Cleared PMC cache");
522 }
523 }
524
525 /// Get cache statistics
526 ///
527 /// Returns the number of items in cache, or 0 if caching is disabled
528 ///
529 /// # Example
530 ///
531 /// ```
532 /// use pubmed_client::PmcClient;
533 ///
534 /// let client = PmcClient::new();
535 /// let count = client.cache_entry_count();
536 /// println!("Cache entries: {}", count);
537 /// ```
538 pub fn cache_entry_count(&self) -> u64 {
539 self.cache.as_ref().map_or(0, |cache| cache.entry_count())
540 }
541
542 /// Synchronize cache operations to ensure all pending operations are flushed
543 ///
544 /// This is useful for testing to ensure cache statistics are accurate
545 pub async fn sync_cache(&self) {
546 if let Some(cache) = &self.cache {
547 cache.sync().await;
548 }
549 }
550
551 /// Normalize PMCID format (ensure it starts with "PMC")
552 fn normalize_pmcid(&self, pmcid: &str) -> String {
553 // Use PmcId for validation and normalization
554 // If parsing fails, fall back to the old behavior for backwards compatibility
555 PmcId::parse(pmcid)
556 .map(|id| id.as_str())
557 .unwrap_or_else(|_| {
558 if pmcid.starts_with("PMC") {
559 pmcid.to_string()
560 } else {
561 format!("PMC{pmcid}")
562 }
563 })
564 }
565
566 /// Internal helper method for making HTTP requests with retry logic
567 async fn make_request(&self, url: &str) -> Result<Response> {
568 with_retry(
569 || async {
570 self.rate_limiter.acquire().await?;
571 debug!("Making API request to: {url}");
572 let response = self
573 .client
574 .get(url)
575 .send()
576 .await
577 .map_err(PubMedError::from)?;
578
579 // Check if response has server error status and convert to retryable error
580 if response.status().is_server_error() || response.status().as_u16() == 429 {
581 return Err(PubMedError::ApiError {
582 status: response.status().as_u16(),
583 message: response
584 .status()
585 .canonical_reason()
586 .unwrap_or("Unknown error")
587 .to_string(),
588 });
589 }
590
591 Ok(response)
592 },
593 &self.config.retry_config,
594 "NCBI API request",
595 )
596 .await
597 }
598}
599
600impl Default for PmcClient {
601 fn default() -> Self {
602 Self::new()
603 }
604}
605
#[cfg(test)]
mod tests {
    use super::*;

    /// Bare and already-prefixed IDs normalize to the same prefixed form.
    #[test]
    fn test_normalize_pmcid() {
        let client = PmcClient::new();

        for raw in ["1234567", "PMC1234567"] {
            assert_eq!(client.normalize_pmcid(raw), "PMC1234567");
        }
    }

    /// A default client points at the NCBI E-utilities host.
    #[test]
    fn test_client_creation() {
        let PmcClient { base_url, .. } = PmcClient::new();
        assert!(base_url.contains("eutils.ncbi.nlm.nih.gov"));
    }

    /// `with_base_url` replaces the default base URL.
    #[test]
    fn test_custom_base_url() {
        let custom = "https://custom.api.example.com";
        let client = PmcClient::new().with_base_url(custom.to_string());
        assert_eq!(client.base_url, custom);
    }
}
629}