pubmed_parser/pubmed/parser/
batch.rs1use super::preprocessing::strip_inline_html_tags;
7use super::xml_types::PubmedArticleSet;
8use crate::error::{ParseError, Result};
9use crate::pubmed::models::PubMedArticle;
10use quick_xml::de::from_str;
11use tracing::{instrument, warn};
12
13#[instrument(skip(xml), fields(xml_size = xml.len()))]
42pub fn parse_articles_from_xml(xml: &str) -> Result<Vec<PubMedArticle>> {
43 let cleaned_xml = strip_inline_html_tags(xml);
45
46 let article_set: PubmedArticleSet = from_str(&cleaned_xml)
48 .map_err(|e| ParseError::XmlError(format!("Failed to deserialize XML: {}", e)))?;
49
50 let articles: Vec<PubMedArticle> = article_set
52 .articles
53 .into_iter()
54 .filter_map(|article_xml| {
55 let pmid = article_xml
56 .medline_citation
57 .pmid
58 .as_ref()
59 .map(|p| p.value.clone())?;
60
61 match article_xml.into_article(&pmid) {
62 Ok(article) => Some(article),
63 Err(e) => {
64 warn!(pmid = %pmid, error = %e, "Failed to parse article, skipping");
65 None
66 }
67 }
68 })
69 .collect();
70
71 Ok(articles)
72}
73
#[cfg(test)]
mod tests {
    use super::*;

    // Two minimal articles, each with a PMID, a title and a journal title.
    const TWO_ARTICLES_XML: &str = r#"<?xml version="1.0" ?>
<PubmedArticleSet>
<PubmedArticle>
    <MedlineCitation>
        <PMID>12345678</PMID>
        <Article>
            <ArticleTitle>First Article</ArticleTitle>
            <Journal><Title>Journal One</Title></Journal>
        </Article>
    </MedlineCitation>
</PubmedArticle>
<PubmedArticle>
    <MedlineCitation>
        <PMID>87654321</PMID>
        <Article>
            <ArticleTitle>Second Article</ArticleTitle>
            <Journal><Title>Journal Two</Title></Journal>
        </Article>
    </MedlineCitation>
</PubmedArticle>
</PubmedArticleSet>"#;

    // A set element with no articles inside it.
    const EMPTY_SET_XML: &str = r#"<?xml version="1.0" ?>
<PubmedArticleSet>
</PubmedArticleSet>"#;

    // One article whose title and abstract contain inline markup (<i>, <sub>).
    const INLINE_HTML_XML: &str = r#"<?xml version="1.0" ?>
<PubmedArticleSet>
<PubmedArticle>
    <MedlineCitation>
        <PMID>11111111</PMID>
        <Article>
            <ArticleTitle>Article with <i>italic</i> text</ArticleTitle>
            <Abstract>
                <AbstractText>Abstract with H<sub>2</sub>O formula</AbstractText>
            </Abstract>
            <Journal><Title>Test Journal</Title></Journal>
        </Article>
    </MedlineCitation>
</PubmedArticle>
</PubmedArticleSet>"#;

    /// Both articles are returned, in document order, with PMID and title set.
    #[test]
    fn test_parse_multiple_articles() {
        let parsed = parse_articles_from_xml(TWO_ARTICLES_XML).unwrap();

        let expected = [
            ("12345678", "First Article"),
            ("87654321", "Second Article"),
        ];
        assert_eq!(parsed.len(), 2);
        for (idx, &(pmid, title)) in expected.iter().enumerate() {
            assert_eq!(parsed[idx].pmid, pmid);
            assert_eq!(parsed[idx].title, title);
        }
    }

    /// An empty article set parses successfully and yields no articles.
    #[test]
    fn test_parse_empty_set() {
        let parsed = parse_articles_from_xml(EMPTY_SET_XML).unwrap();
        assert!(parsed.is_empty());
    }

    /// Inline markup inside text fields does not break parsing, and the text
    /// wrapped by the markup is still present in the title.
    #[test]
    fn test_parse_with_inline_html() {
        let parsed = parse_articles_from_xml(INLINE_HTML_XML).unwrap();
        assert_eq!(parsed.len(), 1);

        let only = &parsed[0];
        assert!(only.title.contains("italic"));
    }
}