pubmed_parser/pubmed/parser/
mod.rs

1//! PubMed XML parser module
2//!
3//! This module provides functionality for parsing PubMed EFetch XML responses into
4//! structured article metadata. The parser handles complex XML structures including
5//! authors, affiliations, MeSH terms, chemicals, and structured abstracts.
6//!
7//! # Module Organization
8//!
//! - `batch` - Parsing of multiple articles from a single XML response
//! - `preprocessing` - XML cleaning and preparation
//! - `deserializers` - Custom serde deserializers for complex fields
//! - `extractors` - Data extraction utilities (email, country, names)
//! - `xml_types` - Internal XML schema deserialization types
//! - `converters` - Conversion from XML types to public API models
14//!
15//! # Public API
16//!
//! The main entry point is [`parse_article_from_xml`], which takes a PubMed EFetch
//! XML response and returns a [`PubMedArticle`]. To parse every article contained
//! in a response, use [`parse_articles_from_xml`].
19
20mod batch;
21mod converters;
22mod deserializers;
23mod extractors;
24mod preprocessing;
25mod xml_types;
26
27// Re-export preprocessing function for use by PMC parser
28pub(crate) use preprocessing::strip_inline_html_tags;
29
30// Re-export batch parsing function
31pub use batch::parse_articles_from_xml;
32
33use crate::error::{ParseError, Result};
34use crate::pubmed::models::PubMedArticle;
35use quick_xml::de::from_str;
36use tracing::instrument;
37use xml_types::PubmedArticleSet;
38
39/// Parse article from EFetch XML response
40///
41/// Parses a PubMed EFetch XML response and extracts article metadata.
42///
43/// # Arguments
44///
45/// * `xml` - The raw XML string from PubMed EFetch API
46/// * `pmid` - The PubMed ID of the article to extract
47///
48/// # Returns
49///
50/// A [`PubMedArticle`] containing the parsed metadata, or an error if parsing fails.
51///
52/// # Errors
53///
54/// Returns an error if:
55/// - The XML is malformed or doesn't match the expected schema
56/// - The specified PMID is not found in the XML
57/// - Required fields (like article title) are missing
58///
59/// # Example
60///
61/// ```ignore
62/// use pubmed_client::pubmed::parser::parse_article_from_xml;
63///
64/// let xml = r#"<?xml version="1.0"?>
65/// <PubmedArticleSet>
66///   <PubmedArticle>
67///     <MedlineCitation>
68///       <PMID>12345678</PMID>
69///       <Article>
70///         <ArticleTitle>Example Article</ArticleTitle>
71///         <Journal><Title>Example Journal</Title></Journal>
72///       </Article>
73///     </MedlineCitation>
74///   </PubmedArticle>
75/// </PubmedArticleSet>"#;
76///
77/// let article = parse_article_from_xml(xml, "12345678")?;
78/// assert_eq!(article.title, "Example Article");
79/// # Ok::<(), pubmed_client::error::PubMedError>(())
80/// ```
81#[instrument(skip(xml), fields(pmid = %pmid, xml_size = xml.len()))]
82pub fn parse_article_from_xml(xml: &str, pmid: &str) -> Result<PubMedArticle> {
83    // Preprocess XML to remove inline HTML tags that can cause parsing issues
84    // This handles tags like <i>, <sup>, <sub>, <b> that appear in abstracts and titles
85    let cleaned_xml = strip_inline_html_tags(xml);
86
87    // Parse the XML using quick-xml serde
88    let article_set: PubmedArticleSet = from_str(&cleaned_xml)
89        .map_err(|e| ParseError::XmlError(format!("Failed to deserialize XML: {}", e)))?;
90
91    // Find the article with the matching PMID
92    let article_xml = article_set
93        .articles
94        .into_iter()
95        .find(|a| {
96            a.medline_citation
97                .pmid
98                .as_ref()
99                .is_some_and(|p| p.value == pmid)
100        })
101        .ok_or_else(|| ParseError::ArticleNotFound {
102            pmid: pmid.to_string(),
103        })?;
104
105    article_xml.into_article(pmid)
106}
107
108#[cfg(test)]
109mod tests {
110    use super::*;
111
112    #[test]
113    fn test_mesh_term_parsing() {
114        let xml = r#"<?xml version="1.0" ?>
115<!DOCTYPE PubmedArticleSet PUBLIC "-//NLM//DTD PubMedArticle, 1st January 2023//EN" "https://dtd.nlm.nih.gov/ncbi/pubmed/out/pubmed_230101.dtd">
116<PubmedArticleSet>
117<PubmedArticle>
118    <MedlineCitation Status="PubMed-not-MEDLINE" Owner="NLM">
119        <PMID Version="1">12345678</PMID>
120        <Article>
121            <ArticleTitle>Test Article with MeSH Terms</ArticleTitle>
122            <Abstract>
123                <AbstractText>This is a test abstract.</AbstractText>
124            </Abstract>
125            <AuthorList>
126                <Author>
127                    <LastName>Doe</LastName>
128                    <ForeName>John</ForeName>
129                    <Initials>JA</Initials>
130                    <AffiliationInfo>
131                        <Affiliation>Department of Medicine, Harvard Medical School, Boston, MA, USA. john.doe@hms.harvard.edu</Affiliation>
132                    </AffiliationInfo>
133                    <Identifier Source="ORCID">0000-0001-2345-6789</Identifier>
134                </Author>
135            </AuthorList>
136            <Journal>
137                <Title>Test Journal</Title>
138            </Journal>
139        </Article>
140        <MeshHeadingList>
141            <MeshHeading>
142                <DescriptorName UI="D003920" MajorTopicYN="Y">Diabetes Mellitus</DescriptorName>
143                <QualifierName UI="Q000188" MajorTopicYN="N">drug therapy</QualifierName>
144            </MeshHeading>
145            <MeshHeading>
146                <DescriptorName UI="D007333" MajorTopicYN="N">Insulin</DescriptorName>
147            </MeshHeading>
148        </MeshHeadingList>
149        <ChemicalList>
150            <Chemical>
151                <RegistryNumber>11061-68-0</RegistryNumber>
152                <NameOfSubstance UI="D007328">Insulin</NameOfSubstance>
153            </Chemical>
154        </ChemicalList>
155        <KeywordList>
156            <Keyword>diabetes treatment</Keyword>
157            <Keyword>insulin therapy</Keyword>
158        </KeywordList>
159    </MedlineCitation>
160</PubmedArticle>
161</PubmedArticleSet>"#;
162
163        let article = parse_article_from_xml(xml, "12345678").unwrap();
164
165        // Test MeSH headings
166        assert!(article.mesh_headings.is_some());
167        let mesh_headings = article.mesh_headings.as_ref().unwrap();
168        assert_eq!(mesh_headings.len(), 2);
169
170        // Test first MeSH heading (major topic with qualifier)
171        let first_heading = &mesh_headings[0];
172        assert_eq!(first_heading.mesh_terms.len(), 1);
173        let diabetes_term = &first_heading.mesh_terms[0];
174        assert_eq!(diabetes_term.descriptor_name, "Diabetes Mellitus");
175        assert_eq!(diabetes_term.descriptor_ui, "D003920");
176        assert!(diabetes_term.major_topic);
177        assert_eq!(diabetes_term.qualifiers.len(), 1);
178        assert_eq!(diabetes_term.qualifiers[0].qualifier_name, "drug therapy");
179        assert_eq!(diabetes_term.qualifiers[0].qualifier_ui, "Q000188");
180        assert!(!diabetes_term.qualifiers[0].major_topic);
181
182        // Test second MeSH heading (non-major topic)
183        let second_heading = &mesh_headings[1];
184        assert_eq!(second_heading.mesh_terms.len(), 1);
185        let insulin_term = &second_heading.mesh_terms[0];
186        assert_eq!(insulin_term.descriptor_name, "Insulin");
187        assert_eq!(insulin_term.descriptor_ui, "D007333");
188        assert!(!insulin_term.major_topic);
189        assert_eq!(insulin_term.qualifiers.len(), 0);
190
191        // Test chemicals
192        assert!(article.chemical_list.is_some());
193        let chemicals = article.chemical_list.as_ref().unwrap();
194        assert_eq!(chemicals.len(), 1);
195        assert_eq!(chemicals[0].name, "Insulin");
196        assert_eq!(chemicals[0].registry_number, Some("11061-68-0".to_string()));
197        assert_eq!(chemicals[0].ui, Some("D007328".to_string()));
198
199        // Test author parsing
200        assert_eq!(article.authors.len(), 1);
201        assert_eq!(article.author_count, 1);
202        let author = &article.authors[0];
203        assert_eq!(author.surname, Some("Doe".to_string()));
204        assert_eq!(author.given_names, Some("John".to_string()));
205        assert_eq!(author.initials, Some("JA".to_string()));
206        assert_eq!(author.full_name, "John Doe");
207        assert_eq!(author.orcid, Some("0000-0001-2345-6789".to_string()));
208        assert_eq!(author.affiliations.len(), 1);
209        assert!(
210            author.affiliations[0]
211                .institution
212                .as_ref()
213                .unwrap()
214                .contains("Harvard Medical School")
215        );
216
217        // Test keywords
218        assert!(article.keywords.is_some());
219        let keywords = article.keywords.as_ref().unwrap();
220        assert_eq!(keywords.len(), 2);
221        assert_eq!(keywords[0], "diabetes treatment");
222        assert_eq!(keywords[1], "insulin therapy");
223    }
224
225    #[test]
226    fn test_structured_abstract_parsing() {
227        let xml = r#"
228        <PubmedArticleSet>
229            <PubmedArticle>
230                <MedlineCitation>
231                    <PMID>32887691</PMID>
232                    <Article>
233                        <ArticleTitle>A living WHO guideline on drugs for covid-19.</ArticleTitle>
234                        <Abstract>
235                            <AbstractText Label="UPDATES">This is the fourteenth version (thirteenth update) of the living guideline, replacing earlier versions.</AbstractText>
236                            <AbstractText Label="CLINICAL QUESTION">What is the role of drugs in the treatment of patients with covid-19?</AbstractText>
237                            <AbstractText Label="CONTEXT">The evidence base for therapeutics for covid-19 is evolving with numerous randomised controlled trials.</AbstractText>
238                        </Abstract>
239                        <Journal>
240                            <Title>BMJ (Clinical research ed.)</Title>
241                            <JournalIssue>
242                                <PubDate>
243                                    <Year>2020</Year>
244                                    <Month>Sep</Month>
245                                </PubDate>
246                            </JournalIssue>
247                        </Journal>
248                    </Article>
249                </MedlineCitation>
250            </PubmedArticle>
251        </PubmedArticleSet>"#;
252
253        let result = parse_article_from_xml(xml, "32887691");
254        assert!(result.is_ok());
255
256        let article = result.unwrap();
257        assert_eq!(article.pmid, "32887691");
258        assert_eq!(
259            article.title,
260            "A living WHO guideline on drugs for covid-19."
261        );
262
263        // Verify that all three abstract sections are concatenated
264        let abstract_text = article.abstract_text.unwrap();
265        assert!(abstract_text.contains("This is the fourteenth version"));
266        assert!(abstract_text.contains("What is the role of drugs"));
267        assert!(abstract_text.contains("The evidence base for therapeutics"));
268
269        // Verify they are properly concatenated with spaces
270        assert!(abstract_text.contains("earlier versions. What is the role"));
271        assert!(abstract_text.contains("covid-19? The evidence base"));
272    }
273
274    #[test]
275    fn test_abstract_with_inline_html_tags() {
276        // Test that abstracts with inline HTML tags (like <i>, <sub>, <sup>) parse successfully
277        // without errors. This was causing CI failures in Python tests.
278        let xml = r#"<?xml version="1.0" ?>
279<PubmedArticleSet>
280<PubmedArticle>
281    <MedlineCitation>
282        <PMID>41111388</PMID>
283        <Article>
284            <ArticleTitle>Breath analysis with inline formatting</ArticleTitle>
285            <Abstract>
286                <AbstractText>This study presents a novel approach (<i>e.g.</i>, machine learning algorithms) for comprehensive analysis. The method uses H<sub>2</sub>O and CO<sub>2</sub> detection with sensitivity of 10<sup>-9</sup> parts per billion.</AbstractText>
287            </Abstract>
288            <Journal>
289                <Title>Test Journal</Title>
290                <JournalIssue>
291                    <PubDate>
292                        <Year>2024</Year>
293                    </PubDate>
294                </JournalIssue>
295            </Journal>
296        </Article>
297    </MedlineCitation>
298</PubmedArticle>
299</PubmedArticleSet>"#;
300
301        // The critical test: parsing should succeed without errors
302        let result = parse_article_from_xml(xml, "41111388");
303        assert!(
304            result.is_ok(),
305            "Failed to parse XML with inline HTML tags: {:?}",
306            result
307        );
308
309        let article = result.unwrap();
310        assert_eq!(article.pmid, "41111388");
311
312        // Verify we extracted abstract text (even if some inline content might be lost)
313        let abstract_text = article.abstract_text.as_ref();
314        assert!(abstract_text.is_some(), "Abstract text should not be None");
315
316        let text = abstract_text.unwrap();
317
318        // Verify we get the main content (note: text from inline elements may be partially lost
319        // due to quick-xml's mixed content handling, but we should get surrounding text)
320        assert!(
321            text.contains("machine learning algorithms"),
322            "Abstract should contain main text content. Got: {}",
323            text
324        );
325        assert!(
326            text.contains("comprehensive analysis"),
327            "Abstract should contain regular text. Got: {}",
328            text
329        );
330        assert!(
331            text.contains("parts per billion"),
332            "Abstract should contain ending text. Got: {}",
333            text
334        );
335    }
336
337    #[test]
338    fn test_structured_abstract_with_inline_tags() {
339        // Test structured abstracts (with Label attributes) that also contain inline HTML tags
340        let xml = r#"<?xml version="1.0" ?>
341<PubmedArticleSet>
342<PubmedArticle>
343    <MedlineCitation>
344        <PMID>99999999</PMID>
345        <Article>
346            <ArticleTitle>Study with formatted abstract sections</ArticleTitle>
347            <Abstract>
348                <AbstractText Label="BACKGROUND">CRISPR-Cas systems (<i>e.g.</i>, Cas9) are revolutionary.</AbstractText>
349                <AbstractText Label="METHODS">We used <sup>13</sup>C isotope labeling and analyzed pH levels between 5.0-7.5.</AbstractText>
350                <AbstractText Label="RESULTS">Efficacy improved by 10<sup>3</sup>-fold with <i>in vitro</i> conditions.</AbstractText>
351            </Abstract>
352            <Journal>
353                <Title>Test Journal</Title>
354            </Journal>
355        </Article>
356    </MedlineCitation>
357</PubmedArticle>
358</PubmedArticleSet>"#;
359
360        let result = parse_article_from_xml(xml, "99999999");
361        assert!(
362            result.is_ok(),
363            "Failed to parse structured abstract with inline tags"
364        );
365
366        let article = result.unwrap();
367        let abstract_text = article.abstract_text.unwrap();
368
369        // Verify key content from labeled sections was extracted
370        assert!(
371            abstract_text.contains("CRISPR-Cas systems"),
372            "Should extract BACKGROUND content"
373        );
374        assert!(
375            abstract_text.contains("Cas9"),
376            "Should extract text adjacent to inline tags"
377        );
378        assert!(
379            abstract_text.contains("isotope labeling"),
380            "Should extract METHODS content"
381        );
382
383        // Verify multiple sections are present (sections should be concatenated)
384        assert!(
385            abstract_text.contains("revolutionary") && abstract_text.contains("isotope"),
386            "Should concatenate all sections"
387        );
388    }
389
390    #[test]
391    fn test_article_without_mesh_terms() {
392        let xml = r#"<?xml version="1.0" ?>
393<!DOCTYPE PubmedArticleSet PUBLIC "-//NLM//DTD PubMedArticle, 1st January 2023//EN" "https://dtd.nlm.nih.gov/ncbi/pubmed/out/pubmed_230101.dtd">
394<PubmedArticleSet>
395<PubmedArticle>
396    <MedlineCitation Status="PubMed-not-MEDLINE" Owner="NLM">
397        <PMID Version="1">87654321</PMID>
398        <Article>
399            <ArticleTitle>Article Without MeSH Terms</ArticleTitle>
400            <AuthorList>
401                <Author>
402                    <LastName>Smith</LastName>
403                    <ForeName>Jane</ForeName>
404                </Author>
405            </AuthorList>
406            <Journal>
407                <Title>Another Journal</Title>
408            </Journal>
409        </Article>
410    </MedlineCitation>
411</PubmedArticle>
412</PubmedArticleSet>"#;
413
414        let article = parse_article_from_xml(xml, "87654321").unwrap();
415
416        assert_eq!(article.authors.len(), 1);
417        assert_eq!(article.author_count, 1);
418        assert_eq!(article.authors[0].full_name, "Jane Smith");
419        assert!(article.mesh_headings.is_none());
420        assert!(article.chemical_list.is_none());
421        assert!(article.keywords.is_none());
422    }
423
424    // ==================================================================================
425    // Bibliographic field tests
426    //
427    // Strategy to guard against silent None on Optional fields:
428    //   (A) XML cross-check: if the raw XML contains a given element tag, the parsed
429    //       field MUST be Some — catches parser regressions that silently drop data.
430    //   (B) Statistical assertion: parse multiple articles that all have the fields in
431    //       XML, assert 100% extraction rate — catches systematic failures.
432    //   (C) Known-value assertion: hardcode expected values for specific test articles
433    //       — catches incorrect extraction or value corruption.
434    // ==================================================================================
435
436    /// Cross-check helper: if a given XML element tag is present in the source XML,
437    /// the corresponding parsed field must be Some (not silently dropped).
438    fn assert_xml_field_extracted(
439        xml: &str,
440        xml_tag: &str,
441        field_value: &Option<String>,
442        field_name: &str,
443    ) {
444        let tag_open = format!("<{}", xml_tag);
445        if xml.contains(&tag_open) {
446            assert!(
447                field_value.is_some(),
448                "XML contains <{}> but parsed `{}` is None — parser silently dropped the field",
449                xml_tag,
450                field_name,
451            );
452        }
453    }
454
455    /// Apply cross-check to all 6 bibliographic fields at once.
456    fn assert_bibliographic_fields_cross_check(xml: &str, article: &PubMedArticle) {
457        assert_xml_field_extracted(xml, "Volume", &article.volume, "volume");
458        assert_xml_field_extracted(xml, "Issue", &article.issue, "issue");
459        assert_xml_field_extracted(xml, "MedlinePgn", &article.pages, "pages");
460        assert_xml_field_extracted(xml, "Language", &article.language, "language");
461        assert_xml_field_extracted(
462            xml,
463            "ISOAbbreviation",
464            &article.journal_abbreviation,
465            "journal_abbreviation",
466        );
467        // ISSN tag can appear in other contexts; only check within Journal
468        if xml.contains("<ISSN") && xml.contains("<Journal>") {
469            assert!(
470                article.issn.is_some(),
471                "XML contains <ISSN> within <Journal> but parsed `issn` is None",
472            );
473        }
474    }
475
476    #[test]
477    fn test_bibliographic_fields_all_present() {
478        let xml = r#"<?xml version="1.0" ?>
479<PubmedArticleSet>
480<PubmedArticle>
481    <MedlineCitation>
482        <PMID>31978945</PMID>
483        <Article>
484            <Journal>
485                <ISSN IssnType="Electronic">1476-4687</ISSN>
486                <JournalIssue CitedMedium="Internet">
487                    <Volume>579</Volume>
488                    <Issue>7798</Issue>
489                    <PubDate>
490                        <Year>2020</Year>
491                        <Month>Mar</Month>
492                    </PubDate>
493                </JournalIssue>
494                <Title>Nature</Title>
495                <ISOAbbreviation>Nature</ISOAbbreviation>
496            </Journal>
497            <ArticleTitle>A pneumonia outbreak associated with a new coronavirus of probable bat origin.</ArticleTitle>
498            <Pagination>
499                <MedlinePgn>270-273</MedlinePgn>
500            </Pagination>
501            <Language>eng</Language>
502        </Article>
503    </MedlineCitation>
504</PubmedArticle>
505</PubmedArticleSet>"#;
506
507        let article = parse_article_from_xml(xml, "31978945").unwrap();
508
509        // (A) Cross-check: XML elements present → parsed field must be Some
510        assert_bibliographic_fields_cross_check(xml, &article);
511
512        // (C) Exact value assertions
513        assert_eq!(article.volume.as_deref(), Some("579"));
514        assert_eq!(article.issue.as_deref(), Some("7798"));
515        assert_eq!(article.pages.as_deref(), Some("270-273"));
516        assert_eq!(article.language.as_deref(), Some("eng"));
517        assert_eq!(article.journal_abbreviation.as_deref(), Some("Nature"));
518        assert_eq!(article.issn.as_deref(), Some("1476-4687"));
519    }
520
521    #[test]
522    fn test_bibliographic_fields_all_absent() {
523        let xml = r#"<?xml version="1.0" ?>
524<PubmedArticleSet>
525<PubmedArticle>
526    <MedlineCitation>
527        <PMID>99990001</PMID>
528        <Article>
529            <Journal>
530                <Title>Minimal Journal</Title>
531            </Journal>
532            <ArticleTitle>Minimal Article</ArticleTitle>
533        </Article>
534    </MedlineCitation>
535</PubmedArticle>
536</PubmedArticleSet>"#;
537
538        let article = parse_article_from_xml(xml, "99990001").unwrap();
539
540        // Cross-check still holds (no XML tags → None is correct)
541        assert_bibliographic_fields_cross_check(xml, &article);
542
543        // All fields must actually be None
544        assert!(article.volume.is_none());
545        assert!(article.issue.is_none());
546        assert!(article.pages.is_none());
547        assert!(article.language.is_none());
548        assert!(article.journal_abbreviation.is_none());
549        assert!(article.issn.is_none());
550    }
551
552    #[test]
553    fn test_bibliographic_fields_partial() {
554        // Volume + ISOAbbreviation + ISSN + Language present; Issue + Pagination absent
555        let xml = r#"<?xml version="1.0" ?>
556<PubmedArticleSet>
557<PubmedArticle>
558    <MedlineCitation>
559        <PMID>99990002</PMID>
560        <Article>
561            <Journal>
562                <ISSN IssnType="Print">0028-0836</ISSN>
563                <JournalIssue>
564                    <Volume>100</Volume>
565                    <PubDate>
566                        <Year>2023</Year>
567                    </PubDate>
568                </JournalIssue>
569                <Title>Test Journal of Medicine</Title>
570                <ISOAbbreviation>Test J Med</ISOAbbreviation>
571            </Journal>
572            <ArticleTitle>Partial Fields Article</ArticleTitle>
573            <Language>jpn</Language>
574        </Article>
575    </MedlineCitation>
576</PubmedArticle>
577</PubmedArticleSet>"#;
578
579        let article = parse_article_from_xml(xml, "99990002").unwrap();
580
581        // (A) Cross-check: present tags must yield Some
582        assert_bibliographic_fields_cross_check(xml, &article);
583
584        // (C) Exact values for present fields
585        assert_eq!(article.volume.as_deref(), Some("100"));
586        assert_eq!(article.language.as_deref(), Some("jpn"));
587        assert_eq!(article.journal_abbreviation.as_deref(), Some("Test J Med"));
588        assert_eq!(article.issn.as_deref(), Some("0028-0836"));
589
590        // Absent fields must be None
591        assert!(article.issue.is_none(), "No <Issue> in XML, must be None");
592        assert!(
593            article.pages.is_none(),
594            "No <MedlinePgn> in XML, must be None"
595        );
596    }
597
598    #[test]
599    fn test_bibliographic_fields_batch_extraction_rate() {
600        // (B) Statistical approach: parse 3 articles, all have all 6 fields in XML,
601        //     assert 100% extraction rate.  If any field silently returns None,
602        //     the count will be < 3 and the assertion fails.
603        let xml = r#"<?xml version="1.0" ?>
604<PubmedArticleSet>
605<PubmedArticle>
606    <MedlineCitation>
607        <PMID>10000001</PMID>
608        <Article>
609            <Journal>
610                <ISSN IssnType="Electronic">1111-2222</ISSN>
611                <JournalIssue>
612                    <Volume>10</Volume>
613                    <Issue>1</Issue>
614                    <PubDate><Year>2020</Year></PubDate>
615                </JournalIssue>
616                <Title>Journal Alpha</Title>
617                <ISOAbbreviation>J Alpha</ISOAbbreviation>
618            </Journal>
619            <ArticleTitle>Article One</ArticleTitle>
620            <Pagination><MedlinePgn>1-10</MedlinePgn></Pagination>
621            <Language>eng</Language>
622        </Article>
623    </MedlineCitation>
624</PubmedArticle>
625<PubmedArticle>
626    <MedlineCitation>
627        <PMID>10000002</PMID>
628        <Article>
629            <Journal>
630                <ISSN IssnType="Print">3333-4444</ISSN>
631                <JournalIssue>
632                    <Volume>25</Volume>
633                    <Issue>12</Issue>
634                    <PubDate><Year>2021</Year><Month>Dec</Month></PubDate>
635                </JournalIssue>
636                <Title>Journal Beta</Title>
637                <ISOAbbreviation>J Beta</ISOAbbreviation>
638            </Journal>
639            <ArticleTitle>Article Two</ArticleTitle>
640            <Pagination><MedlinePgn>100-115</MedlinePgn></Pagination>
641            <Language>fre</Language>
642        </Article>
643    </MedlineCitation>
644</PubmedArticle>
645<PubmedArticle>
646    <MedlineCitation>
647        <PMID>10000003</PMID>
648        <Article>
649            <Journal>
650                <ISSN IssnType="Electronic">5555-6666</ISSN>
651                <JournalIssue>
652                    <Volume>8</Volume>
653                    <Issue>4</Issue>
654                    <PubDate><Year>2023</Year><Month>Apr</Month><Day>01</Day></PubDate>
655                </JournalIssue>
656                <Title>Journal Gamma</Title>
657                <ISOAbbreviation>J Gamma</ISOAbbreviation>
658            </Journal>
659            <ArticleTitle>Article Three</ArticleTitle>
660            <Pagination><MedlinePgn>e2023001</MedlinePgn></Pagination>
661            <Language>jpn</Language>
662        </Article>
663    </MedlineCitation>
664</PubmedArticle>
665</PubmedArticleSet>"#;
666
667        use crate::pubmed::parser::batch::parse_articles_from_xml;
668        let articles = parse_articles_from_xml(xml).unwrap();
669        assert_eq!(articles.len(), 3, "Should parse all 3 articles");
670
671        let mut counts = [0u32; 6]; // volume, issue, pages, language, abbreviation, issn
672
673        for article in &articles {
674            if article.volume.is_some() {
675                counts[0] += 1;
676            }
677            if article.issue.is_some() {
678                counts[1] += 1;
679            }
680            if article.pages.is_some() {
681                counts[2] += 1;
682            }
683            if article.language.is_some() {
684                counts[3] += 1;
685            }
686            if article.journal_abbreviation.is_some() {
687                counts[4] += 1;
688            }
689            if article.issn.is_some() {
690                counts[5] += 1;
691            }
692        }
693
694        let n = articles.len() as u32;
695        let field_names = [
696            "volume",
697            "issue",
698            "pages",
699            "language",
700            "journal_abbreviation",
701            "issn",
702        ];
703        for (i, name) in field_names.iter().enumerate() {
704            assert_eq!(
705                counts[i], n,
706                "All {} articles have <{}> in XML but only {} were extracted (expected {})",
707                n, name, counts[i], n,
708            );
709        }
710
711        // (C) Spot-check exact values on first and last article
712        let a1 = articles.iter().find(|a| a.pmid == "10000001").unwrap();
713        assert_eq!(a1.volume.as_deref(), Some("10"));
714        assert_eq!(a1.issue.as_deref(), Some("1"));
715        assert_eq!(a1.pages.as_deref(), Some("1-10"));
716        assert_eq!(a1.language.as_deref(), Some("eng"));
717        assert_eq!(a1.journal_abbreviation.as_deref(), Some("J Alpha"));
718        assert_eq!(a1.issn.as_deref(), Some("1111-2222"));
719
720        let a3 = articles.iter().find(|a| a.pmid == "10000003").unwrap();
721        assert_eq!(a3.volume.as_deref(), Some("8"));
722        assert_eq!(a3.issue.as_deref(), Some("4"));
723        assert_eq!(a3.pages.as_deref(), Some("e2023001"));
724        assert_eq!(a3.language.as_deref(), Some("jpn"));
725        assert_eq!(a3.journal_abbreviation.as_deref(), Some("J Gamma"));
726        assert_eq!(a3.issn.as_deref(), Some("5555-6666"));
727    }
728
729    #[test]
730    fn test_bibliographic_fields_nlm_citation() {
731        // End-to-end: parse → cross-check → construct NLM citation string
732        let xml = r#"<?xml version="1.0" ?>
733<PubmedArticleSet>
734<PubmedArticle>
735    <MedlineCitation>
736        <PMID>99990003</PMID>
737        <Article>
738            <Journal>
739                <ISSN IssnType="Electronic">1234-5678</ISSN>
740                <JournalIssue CitedMedium="Internet">
741                    <Volume>45</Volume>
742                    <Issue>3</Issue>
743                    <PubDate>
744                        <Year>2024</Year>
745                        <Month>Jun</Month>
746                        <Day>15</Day>
747                    </PubDate>
748                </JournalIssue>
749                <Title>Journal of Biological Chemistry</Title>
750                <ISOAbbreviation>J Biol Chem</ISOAbbreviation>
751            </Journal>
752            <ArticleTitle>Complete Citation Test Article.</ArticleTitle>
753            <Pagination>
754                <MedlinePgn>e100234</MedlinePgn>
755            </Pagination>
756            <AuthorList>
757                <Author>
758                    <LastName>Tanaka</LastName>
759                    <ForeName>Yuki</ForeName>
760                </Author>
761                <Author>
762                    <LastName>Suzuki</LastName>
763                    <ForeName>Kenji</ForeName>
764                </Author>
765            </AuthorList>
766            <Language>eng</Language>
767        </Article>
768    </MedlineCitation>
769</PubmedArticle>
770</PubmedArticleSet>"#;
771
772        let article = parse_article_from_xml(xml, "99990003").unwrap();
773
774        // (A) Cross-check
775        assert_bibliographic_fields_cross_check(xml, &article);
776
777        // Construct NLM citation and verify
778        let citation = format!(
779            "{}. {}. {} {};{}({}):{}.",
780            article.authors[0].full_name,
781            article.title,
782            article
783                .journal_abbreviation
784                .as_deref()
785                .unwrap_or(&article.journal),
786            "2024",
787            article.volume.as_deref().unwrap_or(""),
788            article.issue.as_deref().unwrap_or(""),
789            article.pages.as_deref().unwrap_or(""),
790        );
791        assert_eq!(
792            citation,
793            "Yuki Tanaka. Complete Citation Test Article.. J Biol Chem 2024;45(3):e100234."
794        );
795    }
796
797    #[test]
798    fn test_parse_article_with_abstract() {
799        let xml = r#"<?xml version="1.0" ?>
800<!DOCTYPE PubmedArticleSet PUBLIC "-//NLM//DTD PubMedArticle, 1st January 2025//EN" "https://dtd.nlm.nih.gov/ncbi/pubmed/out/pubmed_250101.dtd">
801<PubmedArticleSet>
802<PubmedArticle>
803<MedlineCitation Status="MEDLINE" Owner="NLM" IndexingMethod="Manual">
804<PMID Version="1">31978945</PMID>
805<Article PubModel="Print-Electronic">
806<Journal>
807<Title>The New England journal of medicine</Title>
808</Journal>
809<ArticleTitle>A Novel Coronavirus from Patients with Pneumonia in China, 2019.</ArticleTitle>
810<Abstract>
811<AbstractText>In December 2019, a cluster of patients with pneumonia of unknown cause was linked to a seafood wholesale market in Wuhan, China. A previously unknown betacoronavirus was discovered through the use of unbiased sequencing in samples from patients with pneumonia. Human airway epithelial cells were used to isolate a novel coronavirus, named 2019-nCoV, which formed a clade within the subgenus sarbecovirus, Orthocoronavirinae subfamily. Different from both MERS-CoV and SARS-CoV, 2019-nCoV is the seventh member of the family of coronaviruses that infect humans. Enhanced surveillance and further investigation are ongoing. (Funded by the National Key Research and Development Program of China and the National Major Project for Control and Prevention of Infectious Disease in China.).</AbstractText>
812</Abstract>
813<AuthorList CompleteYN="Y">
814<Author ValidYN="Y">
815<LastName>Zhu</LastName>
816<ForeName>Na</ForeName>
817</Author>
818<Author ValidYN="Y">
819<LastName>Zhang</LastName>
820<ForeName>Dingyu</ForeName>
821</Author>
822</AuthorList>
823<PublicationTypeList>
824<PublicationType UI="D016428">Journal Article</PublicationType>
825</PublicationTypeList>
826</Article>
827</MedlineCitation>
828</PubmedArticle>
829</PubmedArticleSet>"#;
830
831        let article = parse_article_from_xml(xml, "31978945").unwrap();
832
833        assert_eq!(article.pmid, "31978945");
834        assert_eq!(
835            article.title,
836            "A Novel Coronavirus from Patients with Pneumonia in China, 2019."
837        );
838        assert_eq!(article.journal, "The New England journal of medicine");
839        assert_eq!(article.authors.len(), 2);
840        assert_eq!(article.authors[0].full_name, "Na Zhu");
841        assert_eq!(article.authors[1].full_name, "Dingyu Zhang");
842        assert_eq!(article.article_types, vec!["Journal Article"]);
843
844        assert!(article.abstract_text.is_some());
845        let abstract_text = article.abstract_text.unwrap();
846        assert!(abstract_text.contains("In December 2019"));
847        assert!(abstract_text.contains("2019-nCoV"));
848        assert!(
849            abstract_text.contains("Enhanced surveillance and further investigation are ongoing")
850        );
851    }
852
853    #[test]
854    fn test_parse_article_without_abstract() {
855        let xml = r#"<?xml version="1.0" ?>
856<!DOCTYPE PubmedArticleSet PUBLIC "-//NLM//DTD PubMedArticle, 1st January 2025//EN" "https://dtd.nlm.nih.gov/ncbi/pubmed/out/pubmed_250101.dtd">
857<PubmedArticleSet>
858<PubmedArticle>
859<MedlineCitation Status="MEDLINE" Owner="NLM" IndexingMethod="Manual">
860<PMID Version="1">33515491</PMID>
861<Article PubModel="Print-Electronic">
862<Journal>
863<Title>Lancet (London, England)</Title>
864</Journal>
865<ArticleTitle>Resurgence of COVID-19 in Manaus, Brazil, despite high seroprevalence.</ArticleTitle>
866<AuthorList CompleteYN="Y">
867<Author ValidYN="Y">
868<LastName>Sabino</LastName>
869<ForeName>Ester C</ForeName>
870</Author>
871</AuthorList>
872<PublicationTypeList>
873<PublicationType UI="D016428">Journal Article</PublicationType>
874</PublicationTypeList>
875</Article>
876</MedlineCitation>
877</PubmedArticle>
878</PubmedArticleSet>"#;
879
880        let article = parse_article_from_xml(xml, "33515491").unwrap();
881
882        assert_eq!(article.pmid, "33515491");
883        assert_eq!(
884            article.title,
885            "Resurgence of COVID-19 in Manaus, Brazil, despite high seroprevalence."
886        );
887        assert_eq!(article.journal, "Lancet (London, England)");
888        assert_eq!(article.authors.len(), 1);
889        assert_eq!(article.authors[0].full_name, "Ester C Sabino");
890        assert!(article.abstract_text.is_none());
891    }
892
893    #[test]
894    fn test_parse_invalid_xml() {
895        let invalid_xml = "<invalid>xml</not_closed>";
896        let result = parse_article_from_xml(invalid_xml, "12345");
897        assert!(result.is_err());
898    }
899
900    #[test]
901    fn test_parse_empty_xml() {
902        let empty_xml = r#"<?xml version="1.0" ?>
903    <PubmedArticleSet>
904    </PubmedArticleSet>"#;
905        let result = parse_article_from_xml(empty_xml, "12345");
906
907        assert!(
908            matches!(
909                result,
910                Err(ParseError::ArticleNotFound { ref pmid }) if pmid == "12345"
911            ),
912            "Expected ArticleNotFound error for PMID 12345"
913        );
914    }
915}
pubmed_parser/pubmed/parser/mod.rs

pubmed_parser/pubmed/parser/
mod.rs