pubmed_client/pmc/
tar.rs

1use std::{path::Path, str, time::Duration};
2
3use crate::common::PmcId;
4use crate::config::ClientConfig;
5use crate::error::{ParseError, PubMedError, Result};
6use crate::pmc::extracted::ExtractedFigure;
7use crate::pmc::parser::parse_pmc_xml;
8use crate::rate_limit::RateLimiter;
9use crate::retry::with_retry;
10use pubmed_parser::pmc::{Figure, PmcArticle, Section};
11use reqwest::{Client, Response};
12use tracing::debug;
13
14#[cfg(not(target_arch = "wasm32"))]
15use {
16    flate2::read::GzDecoder,
17    futures_util::StreamExt,
18    std::{fs, fs::File},
19    tar::Archive,
20    tokio::{fs as tokio_fs, io::AsyncWriteExt, task},
21};
22
/// TAR extraction client for PMC Open Access articles
///
/// Bundles the HTTP client, the rate limiter and the configuration that the
/// download/extraction methods of this type share. Cloning is derived, so a
/// clone carries copies of all three fields.
#[derive(Clone)]
pub struct PmcTarClient {
    /// HTTP client used for every outgoing request (built in `new`).
    client: Client,
    /// Rate limiter obtained from `ClientConfig::create_rate_limiter`; it is
    /// acquired before each request attempt in `make_request`.
    rate_limiter: RateLimiter,
    /// Configuration the client was created with; it supplies the extra API
    /// query parameters and the retry settings used by the request helpers.
    pub(crate) config: ClientConfig,
}
30
31impl PmcTarClient {
32    /// Create a new PMC TAR client with configuration
33    pub fn new(config: ClientConfig) -> Self {
34        let rate_limiter = config.create_rate_limiter();
35
36        let client = {
37            #[cfg(not(target_arch = "wasm32"))]
38            {
39                Client::builder()
40                    .user_agent(config.effective_user_agent())
41                    .timeout(Duration::from_secs(config.timeout.as_secs()))
42                    .build()
43                    .expect("Failed to create HTTP client")
44            }
45
46            #[cfg(target_arch = "wasm32")]
47            {
48                Client::builder()
49                    .user_agent(config.effective_user_agent())
50                    .build()
51                    .expect("Failed to create HTTP client")
52            }
53        };
54
55        Self {
56            client,
57            rate_limiter,
58            config,
59        }
60    }
61
62    /// Download and extract tar.gz file for a PMC article using the OA API
63    ///
64    /// # Arguments
65    ///
66    /// * `pmcid` - PMC ID (with or without "PMC" prefix)
67    /// * `output_dir` - Directory to extract the tar.gz contents to
68    ///
69    /// # Returns
70    ///
71    /// Returns a `Result<Vec<String>>` containing the list of extracted file paths
72    ///
73    /// # Errors
74    ///
75    /// * `ParseError::InvalidPmid` - If the PMCID format is invalid
76    /// * `PubMedError::RequestError` - If the HTTP request fails
77    /// * `ParseError::IoError` - If file operations fail
78    /// * `ParseError::PmcNotAvailable` - If the article is not available in OA
79    ///
80    /// # Example
81    ///
82    /// ```no_run
83    /// use pubmed_client::pmc::tar::PmcTarClient;
84    /// use pubmed_client::ClientConfig;
85    /// use std::path::Path;
86    ///
87    /// #[tokio::main]
88    /// async fn main() -> Result<(), Box<dyn std::error::Error>> {
89    ///     let config = ClientConfig::new();
90    ///     let client = PmcTarClient::new(config);
91    ///     let output_dir = Path::new("./extracted_articles");
92    ///     let files = client.download_and_extract_tar("PMC7906746", output_dir).await?;
93    ///
94    ///     for file in files {
95    ///         println!("Extracted: {}", file);
96    ///     }
97    ///     Ok(())
98    /// }
99    /// ```
100    #[cfg(not(target_arch = "wasm32"))]
101    pub async fn download_and_extract_tar<P: AsRef<Path>>(
102        &self,
103        pmcid: &str,
104        output_dir: P,
105    ) -> Result<Vec<String>> {
106        // Validate and parse PMC ID
107        let pmc_id = PmcId::parse(pmcid)?;
108        let normalized_pmcid = pmc_id.as_str();
109
110        // Create output directory early (before any potential failures)
111        let output_path = output_dir.as_ref();
112        tokio_fs::create_dir_all(output_path)
113            .await
114            .map_err(|e| ParseError::IoError {
115                message: format!("Failed to create output directory: {}", e),
116            })?;
117
118        // Build OA API URL
119        let mut url = format!(
120            "https://www.ncbi.nlm.nih.gov/pmc/utils/oa/oa.fcgi?id={}&format=tgz",
121            normalized_pmcid
122        );
123
124        // Add API parameters if available
125        let api_params = self.config.build_api_params();
126        for (key, value) in api_params {
127            url.push('&');
128            url.push_str(&key);
129            url.push('=');
130            url.push_str(&urlencoding::encode(&value));
131        }
132
133        debug!("Downloading tar.gz from OA API: {}", url);
134
135        // Download the OA API response
136        let response = self.make_request(&url).await?;
137
138        if !response.status().is_success() {
139            return Err(PubMedError::ApiError {
140                status: response.status().as_u16(),
141                message: response
142                    .status()
143                    .canonical_reason()
144                    .unwrap_or("Unknown error")
145                    .to_string(),
146            });
147        }
148
149        // Check if the response is XML (OA API response with download link)
150        let content_type = response
151            .headers()
152            .get("content-type")
153            .and_then(|v| v.to_str().ok())
154            .unwrap_or("");
155
156        debug!("OA API response content-type: {}", content_type);
157
158        let download_url =
159            if content_type.contains("text/xml") || content_type.contains("application/xml") {
160                // Parse XML to extract the actual download URL
161                let xml_content = response.text().await?;
162                debug!("OA API returned XML, parsing for download URL");
163                let parsed_url = self.parse_oa_response(&xml_content, pmcid)?;
164                // Convert FTP URLs to HTTPS for HTTP client compatibility
165                if parsed_url.starts_with("ftp://ftp.ncbi.nlm.nih.gov/") {
166                    parsed_url.replace(
167                        "ftp://ftp.ncbi.nlm.nih.gov/",
168                        "https://ftp.ncbi.nlm.nih.gov/",
169                    )
170                } else {
171                    parsed_url
172                }
173            } else if content_type.contains("application/x-gzip")
174                || content_type.contains("application/gzip")
175            {
176                // Direct tar.gz download - use the original URL
177                url.clone()
178            } else {
179                // Check if it's an error response
180                let error_text = response.text().await?;
181                if error_text.contains("error") || error_text.contains("Error") {
182                    return Err(ParseError::PmcNotAvailable {
183                        id: pmcid.to_string(),
184                    }
185                    .into());
186                }
187                // If we get here, it's likely still an error but we consumed the response
188                return Err(ParseError::PmcNotAvailable {
189                    id: pmcid.to_string(),
190                }
191                .into());
192            };
193
194        // Now download the actual tar.gz file
195        let tar_response = self.make_request(&download_url).await?;
196
197        if !tar_response.status().is_success() {
198            return Err(PubMedError::ApiError {
199                status: tar_response.status().as_u16(),
200                message: tar_response
201                    .status()
202                    .canonical_reason()
203                    .unwrap_or("Unknown error")
204                    .to_string(),
205            });
206        }
207
208        // Create output directory if it doesn't exist
209        let output_path = output_dir.as_ref();
210        tokio_fs::create_dir_all(output_path)
211            .await
212            .map_err(|e| ParseError::IoError {
213                message: format!("Failed to create output directory: {}", e),
214            })?;
215
216        // Stream the response to a temporary file
217        let temp_file_path = output_path.join(format!("{}.tar.gz", normalized_pmcid));
218        let mut temp_file =
219            tokio_fs::File::create(&temp_file_path)
220                .await
221                .map_err(|e| ParseError::IoError {
222                    message: format!("Failed to create temporary file: {}", e),
223                })?;
224
225        let mut stream = tar_response.bytes_stream();
226        while let Some(chunk) = stream.next().await {
227            let chunk = chunk.map_err(PubMedError::from)?;
228            temp_file
229                .write_all(&chunk)
230                .await
231                .map_err(|e| ParseError::IoError {
232                    message: format!("Failed to write to temporary file: {}", e),
233                })?;
234        }
235
236        temp_file.flush().await.map_err(|e| ParseError::IoError {
237            message: format!("Failed to flush temporary file: {}", e),
238        })?;
239
240        debug!("Downloaded tar.gz to: {}", temp_file_path.display());
241
242        // Extract the tar.gz file
243        let extracted_files = self
244            .extract_tar_gz(&temp_file_path, &output_path.to_path_buf())
245            .await?;
246
247        // Clean up temporary file
248        tokio_fs::remove_file(&temp_file_path)
249            .await
250            .map_err(|e| ParseError::IoError {
251                message: format!("Failed to remove temporary file: {}", e),
252            })?;
253
254        Ok(extracted_files)
255    }
256
257    /// Download, extract tar.gz file, and match figures with their captions from XML
258    ///
259    /// # Arguments
260    ///
261    /// * `pmcid` - PMC ID (with or without "PMC" prefix)
262    /// * `output_dir` - Directory to extract the tar.gz contents to
263    ///
264    /// # Returns
265    ///
266    /// Returns a `Result<Vec<ExtractedFigure>>` containing figures with both XML metadata and file paths
267    ///
268    /// # Errors
269    ///
270    /// * `ParseError::InvalidPmid` - If the PMCID format is invalid
271    /// * `PubMedError::RequestError` - If the HTTP request fails
272    /// * `ParseError::IoError` - If file operations fail
273    /// * `ParseError::PmcNotAvailable` - If the article is not available in OA
274    ///
275    /// # Example
276    ///
277    /// ```no_run
278    /// use pubmed_client::pmc::tar::PmcTarClient;
279    /// use pubmed_client::ClientConfig;
280    /// use std::path::Path;
281    ///
282    /// #[tokio::main]
283    /// async fn main() -> Result<(), Box<dyn std::error::Error>> {
284    ///     let config = ClientConfig::new();
285    ///     let client = PmcTarClient::new(config);
286    ///     let output_dir = Path::new("./extracted_articles");
287    ///     let figures = client.extract_figures_with_captions("PMC7906746", output_dir).await?;
288    ///
289    ///     for figure in figures {
290    ///         println!("Figure {}: {}", figure.figure.id, figure.figure.caption);
291    ///         println!("File: {}", figure.extracted_file_path);
292    ///     }
293    ///     Ok(())
294    /// }
295    /// ```
296    #[cfg(not(target_arch = "wasm32"))]
297    pub async fn extract_figures_with_captions<P: AsRef<Path>>(
298        &self,
299        pmcid: &str,
300        output_dir: P,
301    ) -> Result<Vec<ExtractedFigure>> {
302        let normalized_pmcid = self.normalize_pmcid(pmcid);
303
304        // Create output directory early (before any potential failures)
305        let output_path = output_dir.as_ref();
306        tokio_fs::create_dir_all(output_path)
307            .await
308            .map_err(|e| ParseError::IoError {
309                message: format!("Failed to create output directory: {}", e),
310            })?;
311
312        // First, fetch the XML to get figure captions
313        let xml_content = self.fetch_xml(&normalized_pmcid).await?;
314        let full_text = parse_pmc_xml(&xml_content, &normalized_pmcid)?;
315
316        // Extract the tar.gz file
317        let extracted_files = self
318            .download_and_extract_tar(&normalized_pmcid, &output_dir)
319            .await?;
320
321        // Find and match figures
322        let figures = self
323            .match_figures_with_files(&full_text, &extracted_files, &output_dir)
324            .await?;
325
326        Ok(figures)
327    }
328
329    /// Fetch raw XML content from PMC
330    #[cfg(not(target_arch = "wasm32"))]
331    async fn fetch_xml(&self, pmcid: &str) -> Result<String> {
332        // Validate and parse PMC ID
333        let pmc_id = PmcId::parse(pmcid)?;
334        let normalized_pmcid = pmc_id.as_str();
335        let numeric_part = pmc_id.numeric_part();
336
337        // Build URL with API parameters
338        let mut url = format!(
339            "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=pmc&id=PMC{numeric_part}&retmode=xml"
340        );
341
342        // Add API parameters (API key, email, tool)
343        let api_params = self.config.build_api_params();
344        for (key, value) in api_params {
345            url.push('&');
346            url.push_str(&key);
347            url.push('=');
348            url.push_str(&urlencoding::encode(&value));
349        }
350
351        let response = self.make_request(&url).await?;
352
353        if !response.status().is_success() {
354            return Err(PubMedError::ApiError {
355                status: response.status().as_u16(),
356                message: response
357                    .status()
358                    .canonical_reason()
359                    .unwrap_or("Unknown error")
360                    .to_string(),
361            });
362        }
363
364        let xml_content = response.text().await?;
365
366        // Check if the response contains an error
367        if xml_content.contains("<ERROR>") {
368            return Err(ParseError::PmcNotAvailable {
369                id: normalized_pmcid,
370            }
371            .into());
372        }
373
374        Ok(xml_content)
375    }
376
377    /// Parse OA API XML response to extract download URL
378    #[cfg(not(target_arch = "wasm32"))]
379    fn parse_oa_response(&self, xml_content: &str, pmcid: &str) -> Result<String> {
380        use quick_xml::Reader;
381        use quick_xml::events::Event;
382
383        debug!("Parsing OA API XML response: {}", xml_content);
384
385        let mut reader = Reader::from_str(xml_content);
386        reader.config_mut().trim_text(true);
387
388        let mut buf = Vec::new();
389
390        loop {
391            match reader.read_event_into(&mut buf) {
392                Ok(Event::Start(ref e)) | Ok(Event::Empty(ref e))
393                    if e.name().as_ref() == b"link" =>
394                {
395                    debug!("Found link element");
396                    // Look for href attribute
397                    for attr in e.attributes().flatten() {
398                        debug!(
399                            "Attribute: {:?} = {:?}",
400                            str::from_utf8(attr.key.as_ref()).unwrap_or("invalid"),
401                            str::from_utf8(&attr.value).unwrap_or("invalid")
402                        );
403                        if attr.key.as_ref() == b"href" {
404                            let href = str::from_utf8(&attr.value).map_err(|e| {
405                                ParseError::XmlError(format!("Invalid UTF-8 in href: {}", e))
406                            })?;
407                            debug!("Found href: {}", href);
408                            return Ok(href.to_string());
409                        }
410                    }
411                }
412                Ok(Event::Eof) => break,
413                Err(e) => {
414                    return Err(ParseError::XmlError(format!("XML parsing error: {}", e)).into());
415                }
416                _ => {}
417            }
418            buf.clear();
419        }
420
421        debug!("No href attribute found in XML response");
422        Err(ParseError::PmcNotAvailable {
423            id: pmcid.to_string(),
424        }
425        .into())
426    }
427
428    /// Match figures from XML with extracted files
429    #[cfg(not(target_arch = "wasm32"))]
430    async fn match_figures_with_files<P: AsRef<Path>>(
431        &self,
432        full_text: &PmcArticle,
433        extracted_files: &[String],
434        output_dir: P,
435    ) -> Result<Vec<ExtractedFigure>> {
436        let output_path = output_dir.as_ref();
437        let mut matched_figures = Vec::new();
438
439        // Collect all figures from all sections
440        let mut all_figures = Vec::new();
441        for section in &full_text.sections {
442            Self::collect_figures_recursive(section, &mut all_figures);
443        }
444
445        // Common image extensions to look for
446        let image_extensions = [
447            "jpg", "jpeg", "png", "gif", "tiff", "tif", "svg", "eps", "pdf",
448        ];
449
450        for figure in all_figures {
451            // Try to find a matching file for this figure
452            let matching_file =
453                Self::find_matching_file(&figure, extracted_files, &image_extensions);
454
455            if let Some(file_path) = matching_file {
456                let absolute_path =
457                    if file_path.starts_with(&output_path.to_string_lossy().to_string()) {
458                        file_path.clone()
459                    } else {
460                        output_path.join(&file_path).to_string_lossy().to_string()
461                    };
462
463                // Get file size
464                let file_size = tokio_fs::metadata(&absolute_path)
465                    .await
466                    .map(|m| m.len())
467                    .ok();
468
469                // Try to get image dimensions
470                let dimensions = Self::get_image_dimensions(&absolute_path).await;
471
472                matched_figures.push(ExtractedFigure {
473                    figure: figure.clone(),
474                    extracted_file_path: absolute_path,
475                    file_size,
476                    dimensions,
477                });
478            }
479        }
480
481        Ok(matched_figures)
482    }
483
484    /// Recursively collect all figures from sections and subsections
485    #[cfg(not(target_arch = "wasm32"))]
486    fn collect_figures_recursive(section: &Section, figures: &mut Vec<Figure>) {
487        figures.extend(section.figures.clone());
488        for subsection in &section.subsections {
489            Self::collect_figures_recursive(subsection, figures);
490        }
491    }
492
493    /// Find a matching file for a figure based on ID, label, or filename patterns
494    #[cfg(not(target_arch = "wasm32"))]
495    pub fn find_matching_file(
496        figure: &Figure,
497        extracted_files: &[String],
498        image_extensions: &[&str],
499    ) -> Option<String> {
500        // First try to match by figure graphic_href if available
501        if let Some(file_name) = &figure.graphic_href {
502            for file_path in extracted_files {
503                if let Some(filename) = Path::new(file_path).file_name()
504                    && filename.to_string_lossy().contains(file_name)
505                {
506                    return Some(file_path.clone());
507                }
508            }
509        }
510
511        // Try to match by figure ID
512        for file_path in extracted_files {
513            if let Some(filename) = Path::new(file_path).file_name() {
514                let filename_str = filename.to_string_lossy().to_lowercase();
515                let figure_id_lower = figure.id.to_lowercase();
516
517                // Check if filename contains figure ID and has image extension
518                if filename_str.contains(&figure_id_lower)
519                    && let Some(extension) = Path::new(file_path).extension()
520                {
521                    let ext_str = extension.to_string_lossy().to_lowercase();
522                    if image_extensions.contains(&ext_str.as_str()) {
523                        return Some(file_path.clone());
524                    }
525                }
526            }
527        }
528
529        // Try to match by label if available
530        if let Some(label) = &figure.label {
531            let label_clean = label.to_lowercase().replace([' ', '.'], "");
532            for file_path in extracted_files {
533                if let Some(filename) = Path::new(file_path).file_name() {
534                    let filename_str = filename.to_string_lossy().to_lowercase();
535                    if filename_str.contains(&label_clean)
536                        && let Some(extension) = Path::new(file_path).extension()
537                    {
538                        let ext_str = extension.to_string_lossy().to_lowercase();
539                        if image_extensions.contains(&ext_str.as_str()) {
540                            return Some(file_path.clone());
541                        }
542                    }
543                }
544            }
545        }
546
547        None
548    }
549
550    /// Get image dimensions using the image crate
551    #[cfg(not(target_arch = "wasm32"))]
552    async fn get_image_dimensions(file_path: &str) -> Option<(u32, u32)> {
553        task::spawn_blocking({
554            let file_path = file_path.to_string();
555            move || {
556                image::open(&file_path)
557                    .ok()
558                    .map(|img| (img.width(), img.height()))
559            }
560        })
561        .await
562        .ok()
563        .flatten()
564    }
565
566    /// Extract tar.gz file to the specified directory
567    ///
568    /// # Arguments
569    ///
570    /// * `tar_path` - Path to the tar.gz file
571    /// * `output_dir` - Directory to extract contents to
572    ///
573    /// # Returns
574    ///
575    /// Returns a `Result<Vec<String>>` containing the list of extracted file paths
576    #[cfg(not(target_arch = "wasm32"))]
577    async fn extract_tar_gz<P: AsRef<Path>>(
578        &self,
579        tar_path: P,
580        output_dir: P,
581    ) -> Result<Vec<String>> {
582        let tar_path = tar_path.as_ref();
583        let output_dir = output_dir.as_ref();
584
585        // Read the tar.gz file
586        let tar_file = File::open(tar_path).map_err(|e| ParseError::IoError {
587            message: format!("Failed to open tar.gz file: {}", e),
588        })?;
589
590        let tar_gz = GzDecoder::new(tar_file);
591        let mut archive = Archive::new(tar_gz);
592
593        let mut extracted_files = Vec::new();
594
595        // Extract all entries
596        for entry in archive.entries().map_err(|e| ParseError::IoError {
597            message: format!("Failed to read tar entries: {}", e),
598        })? {
599            let mut entry = entry.map_err(|e| ParseError::IoError {
600                message: format!("Failed to read tar entry: {}", e),
601            })?;
602
603            let path = entry.path().map_err(|e| ParseError::IoError {
604                message: format!("Failed to get entry path: {}", e),
605            })?;
606
607            let output_path = output_dir.join(&path);
608
609            // Create parent directories if they don't exist
610            if let Some(parent) = output_path.parent() {
611                fs::create_dir_all(parent).map_err(|e| ParseError::IoError {
612                    message: format!("Failed to create parent directories: {}", e),
613                })?;
614            }
615
616            // Extract the entry
617            entry
618                .unpack(&output_path)
619                .map_err(|e| ParseError::IoError {
620                    message: format!("Failed to extract entry: {}", e),
621                })?;
622
623            extracted_files.push(output_path.to_string_lossy().to_string());
624            debug!("Extracted: {}", output_path.display());
625        }
626
627        Ok(extracted_files)
628    }
629
630    /// Normalize PMCID format (ensure it starts with "PMC")
631    fn normalize_pmcid(&self, pmcid: &str) -> String {
632        // Use PmcId for validation and normalization
633        // If parsing fails, fall back to the old behavior for backwards compatibility
634        PmcId::parse(pmcid)
635            .map(|id| id.as_str())
636            .unwrap_or_else(|_| {
637                if pmcid.starts_with("PMC") {
638                    pmcid.to_string()
639                } else {
640                    format!("PMC{pmcid}")
641                }
642            })
643    }
644
645    /// Internal helper method for making HTTP requests with retry logic
646    async fn make_request(&self, url: &str) -> Result<Response> {
647        with_retry(
648            || async {
649                self.rate_limiter.acquire().await?;
650                debug!("Making API request to: {url}");
651                let response = self
652                    .client
653                    .get(url)
654                    .send()
655                    .await
656                    .map_err(PubMedError::from)?;
657
658                // Check if response has server error status and convert to retryable error
659                if response.status().is_server_error() || response.status().as_u16() == 429 {
660                    return Err(PubMedError::ApiError {
661                        status: response.status().as_u16(),
662                        message: response
663                            .status()
664                            .canonical_reason()
665                            .unwrap_or("Unknown error")
666                            .to_string(),
667                    });
668                }
669
670                Ok(response)
671            },
672            &self.config.retry_config,
673            "NCBI API request",
674        )
675        .await
676    }
677}
678
#[cfg(test)]
mod tests {
    use super::*;

    /// Bare numeric IDs and already-prefixed IDs both normalize to the
    /// "PMC"-prefixed form.
    #[test]
    fn test_normalize_pmcid() {
        let client = PmcTarClient::new(ClientConfig::new());

        for input in ["1234567", "PMC1234567"] {
            assert_eq!(client.normalize_pmcid(input), "PMC1234567");
        }
    }

    /// Constructing a client from a default configuration must not panic.
    #[test]
    fn test_client_creation() {
        let _client = PmcTarClient::new(ClientConfig::new());
    }
}