1use std::{path::Path, str, time::Duration};
2
3use crate::common::PmcId;
4use crate::config::ClientConfig;
5use crate::error::{ParseError, PubMedError, Result};
6use crate::pmc::extracted::ExtractedFigure;
7use crate::pmc::parser::parse_pmc_xml;
8use crate::rate_limit::RateLimiter;
9use crate::retry::with_retry;
10use pubmed_parser::pmc::{Figure, PmcArticle, Section};
11use reqwest::{Client, Response};
12use tracing::debug;
13
14#[cfg(not(target_arch = "wasm32"))]
15use {
16 flate2::read::GzDecoder,
17 futures_util::StreamExt,
18 std::{fs, fs::File},
19 tar::Archive,
20 tokio::{fs as tokio_fs, io::AsyncWriteExt, task},
21};
22
/// Client for downloading and unpacking PMC article packages from the NCBI
/// OA service (`oa.fcgi`).
///
/// Bundles the HTTP client with the rate limiter and the configuration that
/// the methods of this type consult when building and sending requests.
#[derive(Clone)]
pub struct PmcTarClient {
    /// HTTP client used for every outgoing request.
    client: Client,
    /// Limiter that is acquired before each request is sent.
    rate_limiter: RateLimiter,
    /// Client configuration: user agent, timeout, extra API parameters and
    /// the retry policy.
    pub(crate) config: ClientConfig,
}
30
31impl PmcTarClient {
32 pub fn new(config: ClientConfig) -> Self {
34 let rate_limiter = config.create_rate_limiter();
35
36 let client = {
37 #[cfg(not(target_arch = "wasm32"))]
38 {
39 Client::builder()
40 .user_agent(config.effective_user_agent())
41 .timeout(Duration::from_secs(config.timeout.as_secs()))
42 .build()
43 .expect("Failed to create HTTP client")
44 }
45
46 #[cfg(target_arch = "wasm32")]
47 {
48 Client::builder()
49 .user_agent(config.effective_user_agent())
50 .build()
51 .expect("Failed to create HTTP client")
52 }
53 };
54
55 Self {
56 client,
57 rate_limiter,
58 config,
59 }
60 }
61
62 #[cfg(not(target_arch = "wasm32"))]
101 pub async fn download_and_extract_tar<P: AsRef<Path>>(
102 &self,
103 pmcid: &str,
104 output_dir: P,
105 ) -> Result<Vec<String>> {
106 let pmc_id = PmcId::parse(pmcid)?;
108 let normalized_pmcid = pmc_id.as_str();
109
110 let output_path = output_dir.as_ref();
112 tokio_fs::create_dir_all(output_path)
113 .await
114 .map_err(|e| ParseError::IoError {
115 message: format!("Failed to create output directory: {}", e),
116 })?;
117
118 let mut url = format!(
120 "https://www.ncbi.nlm.nih.gov/pmc/utils/oa/oa.fcgi?id={}&format=tgz",
121 normalized_pmcid
122 );
123
124 let api_params = self.config.build_api_params();
126 for (key, value) in api_params {
127 url.push('&');
128 url.push_str(&key);
129 url.push('=');
130 url.push_str(&urlencoding::encode(&value));
131 }
132
133 debug!("Downloading tar.gz from OA API: {}", url);
134
135 let response = self.make_request(&url).await?;
137
138 if !response.status().is_success() {
139 return Err(PubMedError::ApiError {
140 status: response.status().as_u16(),
141 message: response
142 .status()
143 .canonical_reason()
144 .unwrap_or("Unknown error")
145 .to_string(),
146 });
147 }
148
149 let content_type = response
151 .headers()
152 .get("content-type")
153 .and_then(|v| v.to_str().ok())
154 .unwrap_or("");
155
156 debug!("OA API response content-type: {}", content_type);
157
158 let download_url =
159 if content_type.contains("text/xml") || content_type.contains("application/xml") {
160 let xml_content = response.text().await?;
162 debug!("OA API returned XML, parsing for download URL");
163 let parsed_url = self.parse_oa_response(&xml_content, pmcid)?;
164 if parsed_url.starts_with("ftp://ftp.ncbi.nlm.nih.gov/") {
166 parsed_url.replace(
167 "ftp://ftp.ncbi.nlm.nih.gov/",
168 "https://ftp.ncbi.nlm.nih.gov/",
169 )
170 } else {
171 parsed_url
172 }
173 } else if content_type.contains("application/x-gzip")
174 || content_type.contains("application/gzip")
175 {
176 url.clone()
178 } else {
179 let error_text = response.text().await?;
181 if error_text.contains("error") || error_text.contains("Error") {
182 return Err(ParseError::PmcNotAvailable {
183 id: pmcid.to_string(),
184 }
185 .into());
186 }
187 return Err(ParseError::PmcNotAvailable {
189 id: pmcid.to_string(),
190 }
191 .into());
192 };
193
194 let tar_response = self.make_request(&download_url).await?;
196
197 if !tar_response.status().is_success() {
198 return Err(PubMedError::ApiError {
199 status: tar_response.status().as_u16(),
200 message: tar_response
201 .status()
202 .canonical_reason()
203 .unwrap_or("Unknown error")
204 .to_string(),
205 });
206 }
207
208 let output_path = output_dir.as_ref();
210 tokio_fs::create_dir_all(output_path)
211 .await
212 .map_err(|e| ParseError::IoError {
213 message: format!("Failed to create output directory: {}", e),
214 })?;
215
216 let temp_file_path = output_path.join(format!("{}.tar.gz", normalized_pmcid));
218 let mut temp_file =
219 tokio_fs::File::create(&temp_file_path)
220 .await
221 .map_err(|e| ParseError::IoError {
222 message: format!("Failed to create temporary file: {}", e),
223 })?;
224
225 let mut stream = tar_response.bytes_stream();
226 while let Some(chunk) = stream.next().await {
227 let chunk = chunk.map_err(PubMedError::from)?;
228 temp_file
229 .write_all(&chunk)
230 .await
231 .map_err(|e| ParseError::IoError {
232 message: format!("Failed to write to temporary file: {}", e),
233 })?;
234 }
235
236 temp_file.flush().await.map_err(|e| ParseError::IoError {
237 message: format!("Failed to flush temporary file: {}", e),
238 })?;
239
240 debug!("Downloaded tar.gz to: {}", temp_file_path.display());
241
242 let extracted_files = self
244 .extract_tar_gz(&temp_file_path, &output_path.to_path_buf())
245 .await?;
246
247 tokio_fs::remove_file(&temp_file_path)
249 .await
250 .map_err(|e| ParseError::IoError {
251 message: format!("Failed to remove temporary file: {}", e),
252 })?;
253
254 Ok(extracted_files)
255 }
256
257 #[cfg(not(target_arch = "wasm32"))]
297 pub async fn extract_figures_with_captions<P: AsRef<Path>>(
298 &self,
299 pmcid: &str,
300 output_dir: P,
301 ) -> Result<Vec<ExtractedFigure>> {
302 let normalized_pmcid = self.normalize_pmcid(pmcid);
303
304 let output_path = output_dir.as_ref();
306 tokio_fs::create_dir_all(output_path)
307 .await
308 .map_err(|e| ParseError::IoError {
309 message: format!("Failed to create output directory: {}", e),
310 })?;
311
312 let xml_content = self.fetch_xml(&normalized_pmcid).await?;
314 let full_text = parse_pmc_xml(&xml_content, &normalized_pmcid)?;
315
316 let extracted_files = self
318 .download_and_extract_tar(&normalized_pmcid, &output_dir)
319 .await?;
320
321 let figures = self
323 .match_figures_with_files(&full_text, &extracted_files, &output_dir)
324 .await?;
325
326 Ok(figures)
327 }
328
329 #[cfg(not(target_arch = "wasm32"))]
331 async fn fetch_xml(&self, pmcid: &str) -> Result<String> {
332 let pmc_id = PmcId::parse(pmcid)?;
334 let normalized_pmcid = pmc_id.as_str();
335 let numeric_part = pmc_id.numeric_part();
336
337 let mut url = format!(
339 "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=pmc&id=PMC{numeric_part}&retmode=xml"
340 );
341
342 let api_params = self.config.build_api_params();
344 for (key, value) in api_params {
345 url.push('&');
346 url.push_str(&key);
347 url.push('=');
348 url.push_str(&urlencoding::encode(&value));
349 }
350
351 let response = self.make_request(&url).await?;
352
353 if !response.status().is_success() {
354 return Err(PubMedError::ApiError {
355 status: response.status().as_u16(),
356 message: response
357 .status()
358 .canonical_reason()
359 .unwrap_or("Unknown error")
360 .to_string(),
361 });
362 }
363
364 let xml_content = response.text().await?;
365
366 if xml_content.contains("<ERROR>") {
368 return Err(ParseError::PmcNotAvailable {
369 id: normalized_pmcid,
370 }
371 .into());
372 }
373
374 Ok(xml_content)
375 }
376
377 #[cfg(not(target_arch = "wasm32"))]
379 fn parse_oa_response(&self, xml_content: &str, pmcid: &str) -> Result<String> {
380 use quick_xml::Reader;
381 use quick_xml::events::Event;
382
383 debug!("Parsing OA API XML response: {}", xml_content);
384
385 let mut reader = Reader::from_str(xml_content);
386 reader.config_mut().trim_text(true);
387
388 let mut buf = Vec::new();
389
390 loop {
391 match reader.read_event_into(&mut buf) {
392 Ok(Event::Start(ref e)) | Ok(Event::Empty(ref e))
393 if e.name().as_ref() == b"link" =>
394 {
395 debug!("Found link element");
396 for attr in e.attributes().flatten() {
398 debug!(
399 "Attribute: {:?} = {:?}",
400 str::from_utf8(attr.key.as_ref()).unwrap_or("invalid"),
401 str::from_utf8(&attr.value).unwrap_or("invalid")
402 );
403 if attr.key.as_ref() == b"href" {
404 let href = str::from_utf8(&attr.value).map_err(|e| {
405 ParseError::XmlError(format!("Invalid UTF-8 in href: {}", e))
406 })?;
407 debug!("Found href: {}", href);
408 return Ok(href.to_string());
409 }
410 }
411 }
412 Ok(Event::Eof) => break,
413 Err(e) => {
414 return Err(ParseError::XmlError(format!("XML parsing error: {}", e)).into());
415 }
416 _ => {}
417 }
418 buf.clear();
419 }
420
421 debug!("No href attribute found in XML response");
422 Err(ParseError::PmcNotAvailable {
423 id: pmcid.to_string(),
424 }
425 .into())
426 }
427
428 #[cfg(not(target_arch = "wasm32"))]
430 async fn match_figures_with_files<P: AsRef<Path>>(
431 &self,
432 full_text: &PmcArticle,
433 extracted_files: &[String],
434 output_dir: P,
435 ) -> Result<Vec<ExtractedFigure>> {
436 let output_path = output_dir.as_ref();
437 let mut matched_figures = Vec::new();
438
439 let mut all_figures = Vec::new();
441 for section in &full_text.sections {
442 Self::collect_figures_recursive(section, &mut all_figures);
443 }
444
445 let image_extensions = [
447 "jpg", "jpeg", "png", "gif", "tiff", "tif", "svg", "eps", "pdf",
448 ];
449
450 for figure in all_figures {
451 let matching_file =
453 Self::find_matching_file(&figure, extracted_files, &image_extensions);
454
455 if let Some(file_path) = matching_file {
456 let absolute_path =
457 if file_path.starts_with(&output_path.to_string_lossy().to_string()) {
458 file_path.clone()
459 } else {
460 output_path.join(&file_path).to_string_lossy().to_string()
461 };
462
463 let file_size = tokio_fs::metadata(&absolute_path)
465 .await
466 .map(|m| m.len())
467 .ok();
468
469 let dimensions = Self::get_image_dimensions(&absolute_path).await;
471
472 matched_figures.push(ExtractedFigure {
473 figure: figure.clone(),
474 extracted_file_path: absolute_path,
475 file_size,
476 dimensions,
477 });
478 }
479 }
480
481 Ok(matched_figures)
482 }
483
484 #[cfg(not(target_arch = "wasm32"))]
486 fn collect_figures_recursive(section: &Section, figures: &mut Vec<Figure>) {
487 figures.extend(section.figures.clone());
488 for subsection in §ion.subsections {
489 Self::collect_figures_recursive(subsection, figures);
490 }
491 }
492
493 #[cfg(not(target_arch = "wasm32"))]
495 pub fn find_matching_file(
496 figure: &Figure,
497 extracted_files: &[String],
498 image_extensions: &[&str],
499 ) -> Option<String> {
500 if let Some(file_name) = &figure.graphic_href {
502 for file_path in extracted_files {
503 if let Some(filename) = Path::new(file_path).file_name()
504 && filename.to_string_lossy().contains(file_name)
505 {
506 return Some(file_path.clone());
507 }
508 }
509 }
510
511 for file_path in extracted_files {
513 if let Some(filename) = Path::new(file_path).file_name() {
514 let filename_str = filename.to_string_lossy().to_lowercase();
515 let figure_id_lower = figure.id.to_lowercase();
516
517 if filename_str.contains(&figure_id_lower)
519 && let Some(extension) = Path::new(file_path).extension()
520 {
521 let ext_str = extension.to_string_lossy().to_lowercase();
522 if image_extensions.contains(&ext_str.as_str()) {
523 return Some(file_path.clone());
524 }
525 }
526 }
527 }
528
529 if let Some(label) = &figure.label {
531 let label_clean = label.to_lowercase().replace([' ', '.'], "");
532 for file_path in extracted_files {
533 if let Some(filename) = Path::new(file_path).file_name() {
534 let filename_str = filename.to_string_lossy().to_lowercase();
535 if filename_str.contains(&label_clean)
536 && let Some(extension) = Path::new(file_path).extension()
537 {
538 let ext_str = extension.to_string_lossy().to_lowercase();
539 if image_extensions.contains(&ext_str.as_str()) {
540 return Some(file_path.clone());
541 }
542 }
543 }
544 }
545 }
546
547 None
548 }
549
550 #[cfg(not(target_arch = "wasm32"))]
552 async fn get_image_dimensions(file_path: &str) -> Option<(u32, u32)> {
553 task::spawn_blocking({
554 let file_path = file_path.to_string();
555 move || {
556 image::open(&file_path)
557 .ok()
558 .map(|img| (img.width(), img.height()))
559 }
560 })
561 .await
562 .ok()
563 .flatten()
564 }
565
566 #[cfg(not(target_arch = "wasm32"))]
577 async fn extract_tar_gz<P: AsRef<Path>>(
578 &self,
579 tar_path: P,
580 output_dir: P,
581 ) -> Result<Vec<String>> {
582 let tar_path = tar_path.as_ref();
583 let output_dir = output_dir.as_ref();
584
585 let tar_file = File::open(tar_path).map_err(|e| ParseError::IoError {
587 message: format!("Failed to open tar.gz file: {}", e),
588 })?;
589
590 let tar_gz = GzDecoder::new(tar_file);
591 let mut archive = Archive::new(tar_gz);
592
593 let mut extracted_files = Vec::new();
594
595 for entry in archive.entries().map_err(|e| ParseError::IoError {
597 message: format!("Failed to read tar entries: {}", e),
598 })? {
599 let mut entry = entry.map_err(|e| ParseError::IoError {
600 message: format!("Failed to read tar entry: {}", e),
601 })?;
602
603 let path = entry.path().map_err(|e| ParseError::IoError {
604 message: format!("Failed to get entry path: {}", e),
605 })?;
606
607 let output_path = output_dir.join(&path);
608
609 if let Some(parent) = output_path.parent() {
611 fs::create_dir_all(parent).map_err(|e| ParseError::IoError {
612 message: format!("Failed to create parent directories: {}", e),
613 })?;
614 }
615
616 entry
618 .unpack(&output_path)
619 .map_err(|e| ParseError::IoError {
620 message: format!("Failed to extract entry: {}", e),
621 })?;
622
623 extracted_files.push(output_path.to_string_lossy().to_string());
624 debug!("Extracted: {}", output_path.display());
625 }
626
627 Ok(extracted_files)
628 }
629
630 fn normalize_pmcid(&self, pmcid: &str) -> String {
632 PmcId::parse(pmcid)
635 .map(|id| id.as_str())
636 .unwrap_or_else(|_| {
637 if pmcid.starts_with("PMC") {
638 pmcid.to_string()
639 } else {
640 format!("PMC{pmcid}")
641 }
642 })
643 }
644
645 async fn make_request(&self, url: &str) -> Result<Response> {
647 with_retry(
648 || async {
649 self.rate_limiter.acquire().await?;
650 debug!("Making API request to: {url}");
651 let response = self
652 .client
653 .get(url)
654 .send()
655 .await
656 .map_err(PubMedError::from)?;
657
658 if response.status().is_server_error() || response.status().as_u16() == 429 {
660 return Err(PubMedError::ApiError {
661 status: response.status().as_u16(),
662 message: response
663 .status()
664 .canonical_reason()
665 .unwrap_or("Unknown error")
666 .to_string(),
667 });
668 }
669
670 Ok(response)
671 },
672 &self.config.retry_config,
673 "NCBI API request",
674 )
675 .await
676 }
677}
678
#[cfg(test)]
mod tests {
    use super::*;

    /// Both the bare number and the prefixed form normalize to the same ID.
    #[test]
    fn test_normalize_pmcid() {
        let client = PmcTarClient::new(ClientConfig::new());

        for input in ["1234567", "PMC1234567"] {
            assert_eq!(client.normalize_pmcid(input), "PMC1234567");
        }
    }

    /// Constructing a client from the default configuration must not panic.
    #[test]
    fn test_client_creation() {
        let _client = PmcTarClient::new(ClientConfig::new());
    }
}