pubmed_parser/pubmed/parser/
batch.rs

1//! Batch parsing for multiple PubMed articles
2//!
3//! This module provides functionality for parsing multiple PubMed articles
4//! from a single EFetch XML response, typically used with the history server.
5
6use super::preprocessing::strip_inline_html_tags;
7use super::xml_types::PubmedArticleSet;
8use crate::error::{ParseError, Result};
9use crate::pubmed::models::PubMedArticle;
10use quick_xml::de::from_str;
11use tracing::{instrument, warn};
12
13/// Parse multiple PubMed articles from EFetch XML response
14///
15/// This function parses an XML response containing multiple `<PubmedArticle>` elements,
16/// typically returned when fetching from the NCBI history server.
17///
18/// # Arguments
19///
20/// * `xml` - The raw XML string from PubMed EFetch API containing multiple articles
21///
22/// # Returns
23///
24/// A `Result<Vec<PubMedArticle>>` containing all successfully parsed articles.
25/// Articles that fail to parse are logged and skipped.
26///
27/// # Example
28///
29/// ```ignore
30/// use pubmed_client::pubmed::parser::parse_articles_from_xml;
31///
32/// let xml = r#"<?xml version="1.0"?>
33/// <PubmedArticleSet>
34///   <PubmedArticle>...</PubmedArticle>
35///   <PubmedArticle>...</PubmedArticle>
36/// </PubmedArticleSet>"#;
37///
38/// let articles = parse_articles_from_xml(xml)?;
39/// println!("Parsed {} articles", articles.len());
40/// ```
41#[instrument(skip(xml), fields(xml_size = xml.len()))]
42pub fn parse_articles_from_xml(xml: &str) -> Result<Vec<PubMedArticle>> {
43    // Preprocess XML to remove inline HTML tags that can cause parsing issues
44    let cleaned_xml = strip_inline_html_tags(xml);
45
46    // Parse the XML using quick-xml serde
47    let article_set: PubmedArticleSet = from_str(&cleaned_xml)
48        .map_err(|e| ParseError::XmlError(format!("Failed to deserialize XML: {}", e)))?;
49
50    // Convert all articles, skipping those that fail
51    let articles: Vec<PubMedArticle> = article_set
52        .articles
53        .into_iter()
54        .filter_map(|article_xml| {
55            let pmid = article_xml
56                .medline_citation
57                .pmid
58                .as_ref()
59                .map(|p| p.value.clone())?;
60
61            match article_xml.into_article(&pmid) {
62                Ok(article) => Some(article),
63                Err(e) => {
64                    warn!(pmid = %pmid, error = %e, "Failed to parse article, skipping");
65                    None
66                }
67            }
68        })
69        .collect();
70
71    Ok(articles)
72}
73
#[cfg(test)]
mod tests {
    use super::*;

    /// Response holding two minimal articles with distinct PMIDs and titles.
    const TWO_ARTICLES_XML: &str = r#"<?xml version="1.0" ?>
<PubmedArticleSet>
<PubmedArticle>
    <MedlineCitation>
        <PMID>12345678</PMID>
        <Article>
            <ArticleTitle>First Article</ArticleTitle>
            <Journal><Title>Journal One</Title></Journal>
        </Article>
    </MedlineCitation>
</PubmedArticle>
<PubmedArticle>
    <MedlineCitation>
        <PMID>87654321</PMID>
        <Article>
            <ArticleTitle>Second Article</ArticleTitle>
            <Journal><Title>Journal Two</Title></Journal>
        </Article>
    </MedlineCitation>
</PubmedArticle>
</PubmedArticleSet>"#;

    /// Response whose article set contains no articles at all.
    const EMPTY_SET_XML: &str = r#"<?xml version="1.0" ?>
<PubmedArticleSet>
</PubmedArticleSet>"#;

    /// Response with inline `<i>` / `<sub>` markup inside the title and abstract.
    const INLINE_HTML_XML: &str = r#"<?xml version="1.0" ?>
<PubmedArticleSet>
<PubmedArticle>
    <MedlineCitation>
        <PMID>11111111</PMID>
        <Article>
            <ArticleTitle>Article with <i>italic</i> text</ArticleTitle>
            <Abstract>
                <AbstractText>Abstract with H<sub>2</sub>O formula</AbstractText>
            </Abstract>
            <Journal><Title>Test Journal</Title></Journal>
        </Article>
    </MedlineCitation>
</PubmedArticle>
</PubmedArticleSet>"#;

    #[test]
    fn test_parse_multiple_articles() {
        let parsed = parse_articles_from_xml(TWO_ARTICLES_XML).unwrap();
        assert_eq!(parsed.len(), 2);

        // Articles must come back in document order.
        let first = &parsed[0];
        assert_eq!(first.pmid, "12345678");
        assert_eq!(first.title, "First Article");

        let second = &parsed[1];
        assert_eq!(second.pmid, "87654321");
        assert_eq!(second.title, "Second Article");
    }

    #[test]
    fn test_parse_empty_set() {
        let parsed = parse_articles_from_xml(EMPTY_SET_XML).unwrap();
        assert!(parsed.is_empty());
    }

    #[test]
    fn test_parse_with_inline_html() {
        let parsed = parse_articles_from_xml(INLINE_HTML_XML).unwrap();
        assert_eq!(parsed.len(), 1);

        // The text wrapped by the inline tag must survive preprocessing.
        let only = &parsed[0];
        assert!(only.title.contains("italic"));
    }
}