pubmed_parser/pmc/parser/
mod.rs

1use crate::common::{PmcId, PubMedId};
2use crate::error::Result;
3use crate::pmc::domain::{
4    Abstract, ArticleMeta, Back, Body, Front, License, Permissions, PmcArticle, TitleGroup,
5};
6
7pub mod author;
8pub mod metadata;
9pub(crate) mod reader_utils;
10pub mod reference;
11pub mod section;
12pub mod xml_utils;
13
/// Extract a section slice from XML content without allocating.
///
/// Returns a `&str` slice running from the first opening tag that matches
/// `start_tag` through the first occurrence of `end_tag` after it (both
/// inclusive), or `None` if either tag cannot be found.
///
/// When `start_tag` has the shape `<name>`, an opening tag that carries
/// attributes (for example `<back id="b1">`) is matched as well, so sections
/// are not lost merely because the element has attributes. Longer element
/// names that share the prefix (for example `<front-stub>` when searching for
/// `<front>`) are not matched.
///
/// Nested elements with the same name are not balanced: the slice ends at the
/// first `end_tag` encountered.
fn extract_section_slice<'a>(content: &'a str, start_tag: &str, end_tag: &str) -> Option<&'a str> {
    let start = find_open_tag(content, start_tag)?;
    let section = &content[start..];
    let end_offset = section.find(end_tag)?;
    Some(&section[..end_offset + end_tag.len()])
}

/// Return the byte offset of the first opening tag in `content` matching `start_tag`.
///
/// For a `<name>`-shaped `start_tag`, the tag name may be followed either by `>`
/// or by whitespace (i.e. attributes). Any other pattern is searched verbatim.
fn find_open_tag(content: &str, start_tag: &str) -> Option<usize> {
    let stem = match start_tag.strip_suffix('>') {
        Some(stem) if stem.len() > 1 && stem.starts_with('<') => stem,
        // Not a plain `<name>` pattern: keep the exact-substring behavior.
        _ => return content.find(start_tag),
    };

    content
        .match_indices(stem)
        .map(|(idx, _)| idx)
        .find(|&idx| {
            // The character right after the name decides whether this is the
            // requested element (`>` or whitespace) or a longer name (`-stub`).
            content[idx + stem.len()..]
                .chars()
                .next()
                .map_or(false, |c| c == '>' || c.is_whitespace())
        })
}
23
24/// Parse PMC XML content into a [`PmcArticle`] domain model.
25///
26/// This function acts as a coordinator that delegates parsing tasks
27/// to specialized parser modules for better maintainability and separation of concerns.
28/// It directly produces domain types without going through legacy intermediate models.
29pub fn parse_pmc_xml(xml_content: &str, pmcid: &str) -> Result<PmcArticle> {
30    let pmcid_typed = PmcId::parse(pmcid)?;
31
32    // Pre-extract major XML sections once to avoid scanning the full document repeatedly.
33    // PMC JATS XML structure: <article> <front>...</front> <body>...</body> <back>...</back> </article>
34    let front = extract_section_slice(xml_content, "<front>", "</front>").unwrap_or(xml_content);
35    let back = extract_section_slice(xml_content, "<back>", "</back>").unwrap_or("");
36
37    // Metadata from <front> (title, journal, dates, IDs, keywords, funding are all in <front>)
38    let title = metadata::extract_title(front);
39    let subtitle = metadata::extract_subtitle(front);
40    let journal = metadata::extract_journal_info(front);
41    let pub_dates = metadata::extract_pub_dates(front);
42    let volume = metadata::extract_volume(front);
43    let issue = metadata::extract_issue(front);
44    let doi = metadata::extract_doi(front);
45    let pmid_str = metadata::extract_pmid(front);
46    let pmid = pmid_str.as_deref().map(PubMedId::parse).transpose()?;
47    let keywords = metadata::extract_keywords(front);
48    let funding = metadata::extract_funding(front);
49
50    // Additional metadata from <front>
51    let abstract_text = metadata::extract_abstract(front);
52    let copyright = metadata::extract_copyright(front);
53    let license = metadata::extract_license(front);
54    let license_url = metadata::extract_license_url(front);
55    let history_dates = metadata::extract_history_dates(front);
56    let categories = metadata::extract_categories(front);
57    let fpage = metadata::extract_fpage(front);
58    let lpage = metadata::extract_lpage(front);
59    let elocation_id = metadata::extract_elocation_id(front);
60
61    // Article type is an attribute on the <article> tag itself (before <front>)
62    let article_type = metadata::extract_article_type(xml_content);
63
64    // Back matter
65    let conflict_of_interest = metadata::extract_conflict_of_interest(back);
66    let acknowledgments = metadata::extract_acknowledgments(back);
67
68    // These can appear in body or back, so search full content
69    let data_availability = metadata::extract_data_availability(xml_content);
70    let supplementary_materials = metadata::extract_supplementary_materials(xml_content);
71
72    // Authors from <front> (contrib-group is in article-meta)
73    let authors = author::extract_authors(front)?;
74
75    // Sections from <body> (extract_sections_enhanced finds <body> internally)
76    let sections = section::extract_sections_enhanced(xml_content);
77
78    // References from <back> (extract_references_detailed finds <ref-list>/<back> internally)
79    let references = reference::extract_references_detailed(xml_content).unwrap_or_default();
80
81    // Assemble <permissions>: copyright + license
82    let license = if license.is_some() || license_url.is_some() {
83        Some(License {
84            href: license_url,
85            text: license,
86        })
87    } else {
88        None
89    };
90    let permissions = if copyright.is_some() || license.is_some() {
91        Some(Permissions {
92            copyright_statement: copyright,
93            license,
94        })
95    } else {
96        None
97    };
98
99    // Assemble <abstract>* (structured abstract sections not yet extracted)
100    let abstracts = abstract_text
101        .map(|text| {
102            vec![Abstract {
103                abstract_type: None,
104                text,
105                sections: Vec::new(),
106            }]
107        })
108        .unwrap_or_default();
109
110    let front = Front {
111        journal_meta: journal,
112        article_meta: ArticleMeta {
113            pmcid: pmcid_typed,
114            pmid,
115            doi,
116            categories,
117            title_group: TitleGroup {
118                article_title: title,
119                subtitle,
120            },
121            authors,
122            pub_dates,
123            volume,
124            issue,
125            fpage,
126            lpage,
127            elocation_id,
128            history: history_dates,
129            permissions,
130            abstracts,
131            keywords,
132            funding,
133        },
134    };
135
136    let body = if xml_content.contains("<body") || !sections.is_empty() {
137        Some(Body { sections })
138    } else {
139        None
140    };
141
142    let has_back_content =
143        acknowledgments.is_some() || conflict_of_interest.is_some() || !references.is_empty();
144    let back = if !back.is_empty() || has_back_content {
145        Some(Back {
146            acknowledgments,
147            conflict_of_interest,
148            references,
149            appendices: Vec::new(),
150            glossary: Vec::new(),
151        })
152    } else {
153        None
154    };
155
156    Ok(PmcArticle {
157        article_type,
158        front,
159        body,
160        back,
161        supplementary_materials,
162        data_availability,
163    })
164}
165
#[cfg(test)]
mod tests {
    use super::*;

    /// End-to-end check that a document with front, body and back matter is
    /// routed through every specialized parser.
    #[test]
    fn test_parse_basic_structure() {
        let xml = r#"
        <article xmlns:xlink="http://www.w3.org/1999/xlink" article-type="research-article">
            <front>
                <article-meta>
                    <article-id pub-id-type="pmc">PMC123456</article-id>
                    <article-id pub-id-type="doi">10.1234/test</article-id>
                    <title-group>
                        <article-title>Test Article Title</article-title>
                    </title-group>
                    <contrib-group>
                        <contrib>
                            <name>
                                <surname>Doe</surname>
                                <given-names>John</given-names>
                            </name>
                        </contrib>
                    </contrib-group>
                    <pub-date>
                        <year>2023</year>
                        <month>12</month>
                        <day>25</day>
                    </pub-date>
                </article-meta>
            </front>
            <body>
                <sec>
                    <title>Introduction</title>
                    <p>This is the introduction.</p>
                </sec>
            </body>
            <back>
                <ref-list>
                    <ref id="ref1">
                        <element-citation>
                            <article-title>Reference Title</article-title>
                        </element-citation>
                    </ref>
                </ref-list>
            </back>
        </article>
        "#;

        let article = parse_pmc_xml(xml, "PMC123456").expect("well-formed article should parse");

        // Identity and title.
        assert_eq!(article.pmcid().as_str(), "PMC123456");
        assert_eq!(article.title(), "Test Article Title");

        // Publication date.
        assert!(!article.pub_dates().is_empty());
        let first_date = &article.pub_dates()[0];
        assert_eq!(first_date.year, Some(2023));
        assert_eq!(first_date.month, Some(12));
        assert_eq!(first_date.day, Some(25));

        // Each delegated parser contributed something.
        assert!(!article.authors().is_empty());
        assert!(!article.sections().is_empty());
        assert!(!article.references().is_empty());
    }

    /// A document with only a title in `<front>` must still parse.
    #[test]
    fn test_parse_minimal_xml() {
        let xml = r#"
        <article>
            <front>
                <article-meta>
                    <title-group>
                        <article-title>Minimal Test</article-title>
                    </title-group>
                </article-meta>
            </front>
        </article>
        "#;

        let article = parse_pmc_xml(xml, "PMC100000").expect("minimal article should parse");

        assert_eq!(article.pmcid().as_str(), "PMC100000");
        assert_eq!(article.title(), "Minimal Test");
    }

    // Note: most detailed tests live next to the code they exercise, in the
    // individual parser modules declared above:
    // - author extraction tests in the `author` module
    // - section extraction tests in the `section` module
    // - reference extraction tests in the `reference` module
    // - metadata extraction tests in the `metadata` module
    // - XML helper tests in the `xml_utils` module
}