1mod batch;
21mod converters;
22mod deserializers;
23mod extractors;
24mod preprocessing;
25mod xml_types;
26
27pub(crate) use preprocessing::strip_inline_html_tags;
29
30pub use batch::parse_articles_from_xml;
32
33use crate::error::{ParseError, Result};
34use crate::pubmed::models::PubMedArticle;
35use quick_xml::de::from_str;
36use tracing::instrument;
37use xml_types::PubmedArticleSet;
38
39#[instrument(skip(xml), fields(pmid = %pmid, xml_size = xml.len()))]
82pub fn parse_article_from_xml(xml: &str, pmid: &str) -> Result<PubMedArticle> {
83 let cleaned_xml = strip_inline_html_tags(xml);
86
87 let article_set: PubmedArticleSet = from_str(&cleaned_xml)
89 .map_err(|e| ParseError::XmlError(format!("Failed to deserialize XML: {}", e)))?;
90
91 let article_xml = article_set
93 .articles
94 .into_iter()
95 .find(|a| {
96 a.medline_citation
97 .pmid
98 .as_ref()
99 .is_some_and(|p| p.value == pmid)
100 })
101 .ok_or_else(|| ParseError::ArticleNotFound {
102 pmid: pmid.to_string(),
103 })?;
104
105 article_xml.into_article(pmid)
106}
107
108#[cfg(test)]
109mod tests {
110 use super::*;
111
112 #[test]
113 fn test_mesh_term_parsing() {
114 let xml = r#"<?xml version="1.0" ?>
115<!DOCTYPE PubmedArticleSet PUBLIC "-//NLM//DTD PubMedArticle, 1st January 2023//EN" "https://dtd.nlm.nih.gov/ncbi/pubmed/out/pubmed_230101.dtd">
116<PubmedArticleSet>
117<PubmedArticle>
118 <MedlineCitation Status="PubMed-not-MEDLINE" Owner="NLM">
119 <PMID Version="1">12345678</PMID>
120 <Article>
121 <ArticleTitle>Test Article with MeSH Terms</ArticleTitle>
122 <Abstract>
123 <AbstractText>This is a test abstract.</AbstractText>
124 </Abstract>
125 <AuthorList>
126 <Author>
127 <LastName>Doe</LastName>
128 <ForeName>John</ForeName>
129 <Initials>JA</Initials>
130 <AffiliationInfo>
131 <Affiliation>Department of Medicine, Harvard Medical School, Boston, MA, USA. john.doe@hms.harvard.edu</Affiliation>
132 </AffiliationInfo>
133 <Identifier Source="ORCID">0000-0001-2345-6789</Identifier>
134 </Author>
135 </AuthorList>
136 <Journal>
137 <Title>Test Journal</Title>
138 </Journal>
139 </Article>
140 <MeshHeadingList>
141 <MeshHeading>
142 <DescriptorName UI="D003920" MajorTopicYN="Y">Diabetes Mellitus</DescriptorName>
143 <QualifierName UI="Q000188" MajorTopicYN="N">drug therapy</QualifierName>
144 </MeshHeading>
145 <MeshHeading>
146 <DescriptorName UI="D007333" MajorTopicYN="N">Insulin</DescriptorName>
147 </MeshHeading>
148 </MeshHeadingList>
149 <ChemicalList>
150 <Chemical>
151 <RegistryNumber>11061-68-0</RegistryNumber>
152 <NameOfSubstance UI="D007328">Insulin</NameOfSubstance>
153 </Chemical>
154 </ChemicalList>
155 <KeywordList>
156 <Keyword>diabetes treatment</Keyword>
157 <Keyword>insulin therapy</Keyword>
158 </KeywordList>
159 </MedlineCitation>
160</PubmedArticle>
161</PubmedArticleSet>"#;
162
163 let article = parse_article_from_xml(xml, "12345678").unwrap();
164
165 assert!(article.mesh_headings.is_some());
167 let mesh_headings = article.mesh_headings.as_ref().unwrap();
168 assert_eq!(mesh_headings.len(), 2);
169
170 let first_heading = &mesh_headings[0];
172 assert_eq!(first_heading.mesh_terms.len(), 1);
173 let diabetes_term = &first_heading.mesh_terms[0];
174 assert_eq!(diabetes_term.descriptor_name, "Diabetes Mellitus");
175 assert_eq!(diabetes_term.descriptor_ui, "D003920");
176 assert!(diabetes_term.major_topic);
177 assert_eq!(diabetes_term.qualifiers.len(), 1);
178 assert_eq!(diabetes_term.qualifiers[0].qualifier_name, "drug therapy");
179 assert_eq!(diabetes_term.qualifiers[0].qualifier_ui, "Q000188");
180 assert!(!diabetes_term.qualifiers[0].major_topic);
181
182 let second_heading = &mesh_headings[1];
184 assert_eq!(second_heading.mesh_terms.len(), 1);
185 let insulin_term = &second_heading.mesh_terms[0];
186 assert_eq!(insulin_term.descriptor_name, "Insulin");
187 assert_eq!(insulin_term.descriptor_ui, "D007333");
188 assert!(!insulin_term.major_topic);
189 assert_eq!(insulin_term.qualifiers.len(), 0);
190
191 assert!(article.chemical_list.is_some());
193 let chemicals = article.chemical_list.as_ref().unwrap();
194 assert_eq!(chemicals.len(), 1);
195 assert_eq!(chemicals[0].name, "Insulin");
196 assert_eq!(chemicals[0].registry_number, Some("11061-68-0".to_string()));
197 assert_eq!(chemicals[0].ui, Some("D007328".to_string()));
198
199 assert_eq!(article.authors.len(), 1);
201 assert_eq!(article.author_count, 1);
202 let author = &article.authors[0];
203 assert_eq!(author.surname, Some("Doe".to_string()));
204 assert_eq!(author.given_names, Some("John".to_string()));
205 assert_eq!(author.initials, Some("JA".to_string()));
206 assert_eq!(author.full_name, "John Doe");
207 assert_eq!(author.orcid, Some("0000-0001-2345-6789".to_string()));
208 assert_eq!(author.affiliations.len(), 1);
209 assert!(
210 author.affiliations[0]
211 .institution
212 .as_ref()
213 .unwrap()
214 .contains("Harvard Medical School")
215 );
216
217 assert!(article.keywords.is_some());
219 let keywords = article.keywords.as_ref().unwrap();
220 assert_eq!(keywords.len(), 2);
221 assert_eq!(keywords[0], "diabetes treatment");
222 assert_eq!(keywords[1], "insulin therapy");
223 }
224
225 #[test]
226 fn test_structured_abstract_parsing() {
227 let xml = r#"
228 <PubmedArticleSet>
229 <PubmedArticle>
230 <MedlineCitation>
231 <PMID>32887691</PMID>
232 <Article>
233 <ArticleTitle>A living WHO guideline on drugs for covid-19.</ArticleTitle>
234 <Abstract>
235 <AbstractText Label="UPDATES">This is the fourteenth version (thirteenth update) of the living guideline, replacing earlier versions.</AbstractText>
236 <AbstractText Label="CLINICAL QUESTION">What is the role of drugs in the treatment of patients with covid-19?</AbstractText>
237 <AbstractText Label="CONTEXT">The evidence base for therapeutics for covid-19 is evolving with numerous randomised controlled trials.</AbstractText>
238 </Abstract>
239 <Journal>
240 <Title>BMJ (Clinical research ed.)</Title>
241 <JournalIssue>
242 <PubDate>
243 <Year>2020</Year>
244 <Month>Sep</Month>
245 </PubDate>
246 </JournalIssue>
247 </Journal>
248 </Article>
249 </MedlineCitation>
250 </PubmedArticle>
251 </PubmedArticleSet>"#;
252
253 let result = parse_article_from_xml(xml, "32887691");
254 assert!(result.is_ok());
255
256 let article = result.unwrap();
257 assert_eq!(article.pmid, "32887691");
258 assert_eq!(
259 article.title,
260 "A living WHO guideline on drugs for covid-19."
261 );
262
263 let abstract_text = article.abstract_text.unwrap();
265 assert!(abstract_text.contains("This is the fourteenth version"));
266 assert!(abstract_text.contains("What is the role of drugs"));
267 assert!(abstract_text.contains("The evidence base for therapeutics"));
268
269 assert!(abstract_text.contains("earlier versions. What is the role"));
271 assert!(abstract_text.contains("covid-19? The evidence base"));
272 }
273
274 #[test]
275 fn test_abstract_with_inline_html_tags() {
276 let xml = r#"<?xml version="1.0" ?>
279<PubmedArticleSet>
280<PubmedArticle>
281 <MedlineCitation>
282 <PMID>41111388</PMID>
283 <Article>
284 <ArticleTitle>Breath analysis with inline formatting</ArticleTitle>
285 <Abstract>
286 <AbstractText>This study presents a novel approach (<i>e.g.</i>, machine learning algorithms) for comprehensive analysis. The method uses H<sub>2</sub>O and CO<sub>2</sub> detection with sensitivity of 10<sup>-9</sup> parts per billion.</AbstractText>
287 </Abstract>
288 <Journal>
289 <Title>Test Journal</Title>
290 <JournalIssue>
291 <PubDate>
292 <Year>2024</Year>
293 </PubDate>
294 </JournalIssue>
295 </Journal>
296 </Article>
297 </MedlineCitation>
298</PubmedArticle>
299</PubmedArticleSet>"#;
300
301 let result = parse_article_from_xml(xml, "41111388");
303 assert!(
304 result.is_ok(),
305 "Failed to parse XML with inline HTML tags: {:?}",
306 result
307 );
308
309 let article = result.unwrap();
310 assert_eq!(article.pmid, "41111388");
311
312 let abstract_text = article.abstract_text.as_ref();
314 assert!(abstract_text.is_some(), "Abstract text should not be None");
315
316 let text = abstract_text.unwrap();
317
318 assert!(
321 text.contains("machine learning algorithms"),
322 "Abstract should contain main text content. Got: {}",
323 text
324 );
325 assert!(
326 text.contains("comprehensive analysis"),
327 "Abstract should contain regular text. Got: {}",
328 text
329 );
330 assert!(
331 text.contains("parts per billion"),
332 "Abstract should contain ending text. Got: {}",
333 text
334 );
335 }
336
337 #[test]
338 fn test_structured_abstract_with_inline_tags() {
339 let xml = r#"<?xml version="1.0" ?>
341<PubmedArticleSet>
342<PubmedArticle>
343 <MedlineCitation>
344 <PMID>99999999</PMID>
345 <Article>
346 <ArticleTitle>Study with formatted abstract sections</ArticleTitle>
347 <Abstract>
348 <AbstractText Label="BACKGROUND">CRISPR-Cas systems (<i>e.g.</i>, Cas9) are revolutionary.</AbstractText>
349 <AbstractText Label="METHODS">We used <sup>13</sup>C isotope labeling and analyzed pH levels between 5.0-7.5.</AbstractText>
350 <AbstractText Label="RESULTS">Efficacy improved by 10<sup>3</sup>-fold with <i>in vitro</i> conditions.</AbstractText>
351 </Abstract>
352 <Journal>
353 <Title>Test Journal</Title>
354 </Journal>
355 </Article>
356 </MedlineCitation>
357</PubmedArticle>
358</PubmedArticleSet>"#;
359
360 let result = parse_article_from_xml(xml, "99999999");
361 assert!(
362 result.is_ok(),
363 "Failed to parse structured abstract with inline tags"
364 );
365
366 let article = result.unwrap();
367 let abstract_text = article.abstract_text.unwrap();
368
369 assert!(
371 abstract_text.contains("CRISPR-Cas systems"),
372 "Should extract BACKGROUND content"
373 );
374 assert!(
375 abstract_text.contains("Cas9"),
376 "Should extract text adjacent to inline tags"
377 );
378 assert!(
379 abstract_text.contains("isotope labeling"),
380 "Should extract METHODS content"
381 );
382
383 assert!(
385 abstract_text.contains("revolutionary") && abstract_text.contains("isotope"),
386 "Should concatenate all sections"
387 );
388 }
389
390 #[test]
391 fn test_article_without_mesh_terms() {
392 let xml = r#"<?xml version="1.0" ?>
393<!DOCTYPE PubmedArticleSet PUBLIC "-//NLM//DTD PubMedArticle, 1st January 2023//EN" "https://dtd.nlm.nih.gov/ncbi/pubmed/out/pubmed_230101.dtd">
394<PubmedArticleSet>
395<PubmedArticle>
396 <MedlineCitation Status="PubMed-not-MEDLINE" Owner="NLM">
397 <PMID Version="1">87654321</PMID>
398 <Article>
399 <ArticleTitle>Article Without MeSH Terms</ArticleTitle>
400 <AuthorList>
401 <Author>
402 <LastName>Smith</LastName>
403 <ForeName>Jane</ForeName>
404 </Author>
405 </AuthorList>
406 <Journal>
407 <Title>Another Journal</Title>
408 </Journal>
409 </Article>
410 </MedlineCitation>
411</PubmedArticle>
412</PubmedArticleSet>"#;
413
414 let article = parse_article_from_xml(xml, "87654321").unwrap();
415
416 assert_eq!(article.authors.len(), 1);
417 assert_eq!(article.author_count, 1);
418 assert_eq!(article.authors[0].full_name, "Jane Smith");
419 assert!(article.mesh_headings.is_none());
420 assert!(article.chemical_list.is_none());
421 assert!(article.keywords.is_none());
422 }
423
424 fn assert_xml_field_extracted(
439 xml: &str,
440 xml_tag: &str,
441 field_value: &Option<String>,
442 field_name: &str,
443 ) {
444 let tag_open = format!("<{}", xml_tag);
445 if xml.contains(&tag_open) {
446 assert!(
447 field_value.is_some(),
448 "XML contains <{}> but parsed `{}` is None — parser silently dropped the field",
449 xml_tag,
450 field_name,
451 );
452 }
453 }
454
455 fn assert_bibliographic_fields_cross_check(xml: &str, article: &PubMedArticle) {
457 assert_xml_field_extracted(xml, "Volume", &article.volume, "volume");
458 assert_xml_field_extracted(xml, "Issue", &article.issue, "issue");
459 assert_xml_field_extracted(xml, "MedlinePgn", &article.pages, "pages");
460 assert_xml_field_extracted(xml, "Language", &article.language, "language");
461 assert_xml_field_extracted(
462 xml,
463 "ISOAbbreviation",
464 &article.journal_abbreviation,
465 "journal_abbreviation",
466 );
467 if xml.contains("<ISSN") && xml.contains("<Journal>") {
469 assert!(
470 article.issn.is_some(),
471 "XML contains <ISSN> within <Journal> but parsed `issn` is None",
472 );
473 }
474 }
475
476 #[test]
477 fn test_bibliographic_fields_all_present() {
478 let xml = r#"<?xml version="1.0" ?>
479<PubmedArticleSet>
480<PubmedArticle>
481 <MedlineCitation>
482 <PMID>31978945</PMID>
483 <Article>
484 <Journal>
485 <ISSN IssnType="Electronic">1476-4687</ISSN>
486 <JournalIssue CitedMedium="Internet">
487 <Volume>579</Volume>
488 <Issue>7798</Issue>
489 <PubDate>
490 <Year>2020</Year>
491 <Month>Mar</Month>
492 </PubDate>
493 </JournalIssue>
494 <Title>Nature</Title>
495 <ISOAbbreviation>Nature</ISOAbbreviation>
496 </Journal>
497 <ArticleTitle>A pneumonia outbreak associated with a new coronavirus of probable bat origin.</ArticleTitle>
498 <Pagination>
499 <MedlinePgn>270-273</MedlinePgn>
500 </Pagination>
501 <Language>eng</Language>
502 </Article>
503 </MedlineCitation>
504</PubmedArticle>
505</PubmedArticleSet>"#;
506
507 let article = parse_article_from_xml(xml, "31978945").unwrap();
508
509 assert_bibliographic_fields_cross_check(xml, &article);
511
512 assert_eq!(article.volume.as_deref(), Some("579"));
514 assert_eq!(article.issue.as_deref(), Some("7798"));
515 assert_eq!(article.pages.as_deref(), Some("270-273"));
516 assert_eq!(article.language.as_deref(), Some("eng"));
517 assert_eq!(article.journal_abbreviation.as_deref(), Some("Nature"));
518 assert_eq!(article.issn.as_deref(), Some("1476-4687"));
519 }
520
521 #[test]
522 fn test_bibliographic_fields_all_absent() {
523 let xml = r#"<?xml version="1.0" ?>
524<PubmedArticleSet>
525<PubmedArticle>
526 <MedlineCitation>
527 <PMID>99990001</PMID>
528 <Article>
529 <Journal>
530 <Title>Minimal Journal</Title>
531 </Journal>
532 <ArticleTitle>Minimal Article</ArticleTitle>
533 </Article>
534 </MedlineCitation>
535</PubmedArticle>
536</PubmedArticleSet>"#;
537
538 let article = parse_article_from_xml(xml, "99990001").unwrap();
539
540 assert_bibliographic_fields_cross_check(xml, &article);
542
543 assert!(article.volume.is_none());
545 assert!(article.issue.is_none());
546 assert!(article.pages.is_none());
547 assert!(article.language.is_none());
548 assert!(article.journal_abbreviation.is_none());
549 assert!(article.issn.is_none());
550 }
551
552 #[test]
553 fn test_bibliographic_fields_partial() {
554 let xml = r#"<?xml version="1.0" ?>
556<PubmedArticleSet>
557<PubmedArticle>
558 <MedlineCitation>
559 <PMID>99990002</PMID>
560 <Article>
561 <Journal>
562 <ISSN IssnType="Print">0028-0836</ISSN>
563 <JournalIssue>
564 <Volume>100</Volume>
565 <PubDate>
566 <Year>2023</Year>
567 </PubDate>
568 </JournalIssue>
569 <Title>Test Journal of Medicine</Title>
570 <ISOAbbreviation>Test J Med</ISOAbbreviation>
571 </Journal>
572 <ArticleTitle>Partial Fields Article</ArticleTitle>
573 <Language>jpn</Language>
574 </Article>
575 </MedlineCitation>
576</PubmedArticle>
577</PubmedArticleSet>"#;
578
579 let article = parse_article_from_xml(xml, "99990002").unwrap();
580
581 assert_bibliographic_fields_cross_check(xml, &article);
583
584 assert_eq!(article.volume.as_deref(), Some("100"));
586 assert_eq!(article.language.as_deref(), Some("jpn"));
587 assert_eq!(article.journal_abbreviation.as_deref(), Some("Test J Med"));
588 assert_eq!(article.issn.as_deref(), Some("0028-0836"));
589
590 assert!(article.issue.is_none(), "No <Issue> in XML, must be None");
592 assert!(
593 article.pages.is_none(),
594 "No <MedlinePgn> in XML, must be None"
595 );
596 }
597
598 #[test]
599 fn test_bibliographic_fields_batch_extraction_rate() {
600 let xml = r#"<?xml version="1.0" ?>
604<PubmedArticleSet>
605<PubmedArticle>
606 <MedlineCitation>
607 <PMID>10000001</PMID>
608 <Article>
609 <Journal>
610 <ISSN IssnType="Electronic">1111-2222</ISSN>
611 <JournalIssue>
612 <Volume>10</Volume>
613 <Issue>1</Issue>
614 <PubDate><Year>2020</Year></PubDate>
615 </JournalIssue>
616 <Title>Journal Alpha</Title>
617 <ISOAbbreviation>J Alpha</ISOAbbreviation>
618 </Journal>
619 <ArticleTitle>Article One</ArticleTitle>
620 <Pagination><MedlinePgn>1-10</MedlinePgn></Pagination>
621 <Language>eng</Language>
622 </Article>
623 </MedlineCitation>
624</PubmedArticle>
625<PubmedArticle>
626 <MedlineCitation>
627 <PMID>10000002</PMID>
628 <Article>
629 <Journal>
630 <ISSN IssnType="Print">3333-4444</ISSN>
631 <JournalIssue>
632 <Volume>25</Volume>
633 <Issue>12</Issue>
634 <PubDate><Year>2021</Year><Month>Dec</Month></PubDate>
635 </JournalIssue>
636 <Title>Journal Beta</Title>
637 <ISOAbbreviation>J Beta</ISOAbbreviation>
638 </Journal>
639 <ArticleTitle>Article Two</ArticleTitle>
640 <Pagination><MedlinePgn>100-115</MedlinePgn></Pagination>
641 <Language>fre</Language>
642 </Article>
643 </MedlineCitation>
644</PubmedArticle>
645<PubmedArticle>
646 <MedlineCitation>
647 <PMID>10000003</PMID>
648 <Article>
649 <Journal>
650 <ISSN IssnType="Electronic">5555-6666</ISSN>
651 <JournalIssue>
652 <Volume>8</Volume>
653 <Issue>4</Issue>
654 <PubDate><Year>2023</Year><Month>Apr</Month><Day>01</Day></PubDate>
655 </JournalIssue>
656 <Title>Journal Gamma</Title>
657 <ISOAbbreviation>J Gamma</ISOAbbreviation>
658 </Journal>
659 <ArticleTitle>Article Three</ArticleTitle>
660 <Pagination><MedlinePgn>e2023001</MedlinePgn></Pagination>
661 <Language>jpn</Language>
662 </Article>
663 </MedlineCitation>
664</PubmedArticle>
665</PubmedArticleSet>"#;
666
667 use crate::pubmed::parser::batch::parse_articles_from_xml;
668 let articles = parse_articles_from_xml(xml).unwrap();
669 assert_eq!(articles.len(), 3, "Should parse all 3 articles");
670
671 let mut counts = [0u32; 6]; for article in &articles {
674 if article.volume.is_some() {
675 counts[0] += 1;
676 }
677 if article.issue.is_some() {
678 counts[1] += 1;
679 }
680 if article.pages.is_some() {
681 counts[2] += 1;
682 }
683 if article.language.is_some() {
684 counts[3] += 1;
685 }
686 if article.journal_abbreviation.is_some() {
687 counts[4] += 1;
688 }
689 if article.issn.is_some() {
690 counts[5] += 1;
691 }
692 }
693
694 let n = articles.len() as u32;
695 let field_names = [
696 "volume",
697 "issue",
698 "pages",
699 "language",
700 "journal_abbreviation",
701 "issn",
702 ];
703 for (i, name) in field_names.iter().enumerate() {
704 assert_eq!(
705 counts[i], n,
706 "All {} articles have <{}> in XML but only {} were extracted (expected {})",
707 n, name, counts[i], n,
708 );
709 }
710
711 let a1 = articles.iter().find(|a| a.pmid == "10000001").unwrap();
713 assert_eq!(a1.volume.as_deref(), Some("10"));
714 assert_eq!(a1.issue.as_deref(), Some("1"));
715 assert_eq!(a1.pages.as_deref(), Some("1-10"));
716 assert_eq!(a1.language.as_deref(), Some("eng"));
717 assert_eq!(a1.journal_abbreviation.as_deref(), Some("J Alpha"));
718 assert_eq!(a1.issn.as_deref(), Some("1111-2222"));
719
720 let a3 = articles.iter().find(|a| a.pmid == "10000003").unwrap();
721 assert_eq!(a3.volume.as_deref(), Some("8"));
722 assert_eq!(a3.issue.as_deref(), Some("4"));
723 assert_eq!(a3.pages.as_deref(), Some("e2023001"));
724 assert_eq!(a3.language.as_deref(), Some("jpn"));
725 assert_eq!(a3.journal_abbreviation.as_deref(), Some("J Gamma"));
726 assert_eq!(a3.issn.as_deref(), Some("5555-6666"));
727 }
728
729 #[test]
730 fn test_bibliographic_fields_nlm_citation() {
731 let xml = r#"<?xml version="1.0" ?>
733<PubmedArticleSet>
734<PubmedArticle>
735 <MedlineCitation>
736 <PMID>99990003</PMID>
737 <Article>
738 <Journal>
739 <ISSN IssnType="Electronic">1234-5678</ISSN>
740 <JournalIssue CitedMedium="Internet">
741 <Volume>45</Volume>
742 <Issue>3</Issue>
743 <PubDate>
744 <Year>2024</Year>
745 <Month>Jun</Month>
746 <Day>15</Day>
747 </PubDate>
748 </JournalIssue>
749 <Title>Journal of Biological Chemistry</Title>
750 <ISOAbbreviation>J Biol Chem</ISOAbbreviation>
751 </Journal>
752 <ArticleTitle>Complete Citation Test Article.</ArticleTitle>
753 <Pagination>
754 <MedlinePgn>e100234</MedlinePgn>
755 </Pagination>
756 <AuthorList>
757 <Author>
758 <LastName>Tanaka</LastName>
759 <ForeName>Yuki</ForeName>
760 </Author>
761 <Author>
762 <LastName>Suzuki</LastName>
763 <ForeName>Kenji</ForeName>
764 </Author>
765 </AuthorList>
766 <Language>eng</Language>
767 </Article>
768 </MedlineCitation>
769</PubmedArticle>
770</PubmedArticleSet>"#;
771
772 let article = parse_article_from_xml(xml, "99990003").unwrap();
773
774 assert_bibliographic_fields_cross_check(xml, &article);
776
777 let citation = format!(
779 "{}. {}. {} {};{}({}):{}.",
780 article.authors[0].full_name,
781 article.title,
782 article
783 .journal_abbreviation
784 .as_deref()
785 .unwrap_or(&article.journal),
786 "2024",
787 article.volume.as_deref().unwrap_or(""),
788 article.issue.as_deref().unwrap_or(""),
789 article.pages.as_deref().unwrap_or(""),
790 );
791 assert_eq!(
792 citation,
793 "Yuki Tanaka. Complete Citation Test Article.. J Biol Chem 2024;45(3):e100234."
794 );
795 }
796
797 #[test]
798 fn test_parse_article_with_abstract() {
799 let xml = r#"<?xml version="1.0" ?>
800<!DOCTYPE PubmedArticleSet PUBLIC "-//NLM//DTD PubMedArticle, 1st January 2025//EN" "https://dtd.nlm.nih.gov/ncbi/pubmed/out/pubmed_250101.dtd">
801<PubmedArticleSet>
802<PubmedArticle>
803<MedlineCitation Status="MEDLINE" Owner="NLM" IndexingMethod="Manual">
804<PMID Version="1">31978945</PMID>
805<Article PubModel="Print-Electronic">
806<Journal>
807<Title>The New England journal of medicine</Title>
808</Journal>
809<ArticleTitle>A Novel Coronavirus from Patients with Pneumonia in China, 2019.</ArticleTitle>
810<Abstract>
811<AbstractText>In December 2019, a cluster of patients with pneumonia of unknown cause was linked to a seafood wholesale market in Wuhan, China. A previously unknown betacoronavirus was discovered through the use of unbiased sequencing in samples from patients with pneumonia. Human airway epithelial cells were used to isolate a novel coronavirus, named 2019-nCoV, which formed a clade within the subgenus sarbecovirus, Orthocoronavirinae subfamily. Different from both MERS-CoV and SARS-CoV, 2019-nCoV is the seventh member of the family of coronaviruses that infect humans. Enhanced surveillance and further investigation are ongoing. (Funded by the National Key Research and Development Program of China and the National Major Project for Control and Prevention of Infectious Disease in China.).</AbstractText>
812</Abstract>
813<AuthorList CompleteYN="Y">
814<Author ValidYN="Y">
815<LastName>Zhu</LastName>
816<ForeName>Na</ForeName>
817</Author>
818<Author ValidYN="Y">
819<LastName>Zhang</LastName>
820<ForeName>Dingyu</ForeName>
821</Author>
822</AuthorList>
823<PublicationTypeList>
824<PublicationType UI="D016428">Journal Article</PublicationType>
825</PublicationTypeList>
826</Article>
827</MedlineCitation>
828</PubmedArticle>
829</PubmedArticleSet>"#;
830
831 let article = parse_article_from_xml(xml, "31978945").unwrap();
832
833 assert_eq!(article.pmid, "31978945");
834 assert_eq!(
835 article.title,
836 "A Novel Coronavirus from Patients with Pneumonia in China, 2019."
837 );
838 assert_eq!(article.journal, "The New England journal of medicine");
839 assert_eq!(article.authors.len(), 2);
840 assert_eq!(article.authors[0].full_name, "Na Zhu");
841 assert_eq!(article.authors[1].full_name, "Dingyu Zhang");
842 assert_eq!(article.article_types, vec!["Journal Article"]);
843
844 assert!(article.abstract_text.is_some());
845 let abstract_text = article.abstract_text.unwrap();
846 assert!(abstract_text.contains("In December 2019"));
847 assert!(abstract_text.contains("2019-nCoV"));
848 assert!(
849 abstract_text.contains("Enhanced surveillance and further investigation are ongoing")
850 );
851 }
852
853 #[test]
854 fn test_parse_article_without_abstract() {
855 let xml = r#"<?xml version="1.0" ?>
856<!DOCTYPE PubmedArticleSet PUBLIC "-//NLM//DTD PubMedArticle, 1st January 2025//EN" "https://dtd.nlm.nih.gov/ncbi/pubmed/out/pubmed_250101.dtd">
857<PubmedArticleSet>
858<PubmedArticle>
859<MedlineCitation Status="MEDLINE" Owner="NLM" IndexingMethod="Manual">
860<PMID Version="1">33515491</PMID>
861<Article PubModel="Print-Electronic">
862<Journal>
863<Title>Lancet (London, England)</Title>
864</Journal>
865<ArticleTitle>Resurgence of COVID-19 in Manaus, Brazil, despite high seroprevalence.</ArticleTitle>
866<AuthorList CompleteYN="Y">
867<Author ValidYN="Y">
868<LastName>Sabino</LastName>
869<ForeName>Ester C</ForeName>
870</Author>
871</AuthorList>
872<PublicationTypeList>
873<PublicationType UI="D016428">Journal Article</PublicationType>
874</PublicationTypeList>
875</Article>
876</MedlineCitation>
877</PubmedArticle>
878</PubmedArticleSet>"#;
879
880 let article = parse_article_from_xml(xml, "33515491").unwrap();
881
882 assert_eq!(article.pmid, "33515491");
883 assert_eq!(
884 article.title,
885 "Resurgence of COVID-19 in Manaus, Brazil, despite high seroprevalence."
886 );
887 assert_eq!(article.journal, "Lancet (London, England)");
888 assert_eq!(article.authors.len(), 1);
889 assert_eq!(article.authors[0].full_name, "Ester C Sabino");
890 assert!(article.abstract_text.is_none());
891 }
892
893 #[test]
894 fn test_parse_invalid_xml() {
895 let invalid_xml = "<invalid>xml</not_closed>";
896 let result = parse_article_from_xml(invalid_xml, "12345");
897 assert!(result.is_err());
898 }
899
900 #[test]
901 fn test_parse_empty_xml() {
902 let empty_xml = r#"<?xml version="1.0" ?>
903 <PubmedArticleSet>
904 </PubmedArticleSet>"#;
905 let result = parse_article_from_xml(empty_xml, "12345");
906
907 assert!(
908 matches!(
909 result,
910 Err(ParseError::ArticleNotFound { ref pmid }) if pmid == "12345"
911 ),
912 "Expected ArticleNotFound error for PMID 12345"
913 );
914 }
915}