pubmed_parser/pmc/parser/
mod.rs1use crate::common::{PmcId, PubMedId};
2use crate::error::Result;
3use crate::pmc::domain::{
4 Abstract, ArticleMeta, Back, Body, Front, License, Permissions, PmcArticle, TitleGroup,
5};
6
7pub mod author;
8pub mod metadata;
9pub(crate) mod reader_utils;
10pub mod reference;
11pub mod section;
12pub mod xml_utils;
13
/// Returns the first `start_tag … end_tag` span found in `content`, both delimiters included.
///
/// The search is purely textual: `start_tag` is located with [`str::find`], and `end_tag` is
/// then searched for in the text that follows the matched `start_tag`.
///
/// # Parameters
///
/// * `content` - text to search.
/// * `start_tag` - literal opening delimiter (for example `"<front>"`).
/// * `end_tag` - literal closing delimiter (for example `"</front>"`).
///
/// # Returns
///
/// `Some(slice)` borrowing from `content`, beginning at the first `start_tag` and ending right
/// after the first `end_tag` that follows it; `None` when either delimiter cannot be found.
fn extract_section_slice<'a>(content: &'a str, start_tag: &str, end_tag: &str) -> Option<&'a str> {
    let start = content.find(start_tag)?;
    // Resume the search *after* the opening delimiter so the closing match can never overlap
    // it (relevant when `end_tag` also occurs inside, or is equal to, `start_tag`).
    let inner_start = start + start_tag.len();
    let end_offset = content[inner_start..].find(end_tag)?;
    Some(&content[start..inner_start + end_offset + end_tag.len()])
}
23
24pub fn parse_pmc_xml(xml_content: &str, pmcid: &str) -> Result<PmcArticle> {
30 let pmcid_typed = PmcId::parse(pmcid)?;
31
32 let front = extract_section_slice(xml_content, "<front>", "</front>").unwrap_or(xml_content);
35 let back = extract_section_slice(xml_content, "<back>", "</back>").unwrap_or("");
36
37 let title = metadata::extract_title(front);
39 let subtitle = metadata::extract_subtitle(front);
40 let journal = metadata::extract_journal_info(front);
41 let pub_dates = metadata::extract_pub_dates(front);
42 let volume = metadata::extract_volume(front);
43 let issue = metadata::extract_issue(front);
44 let doi = metadata::extract_doi(front);
45 let pmid_str = metadata::extract_pmid(front);
46 let pmid = pmid_str.as_deref().map(PubMedId::parse).transpose()?;
47 let keywords = metadata::extract_keywords(front);
48 let funding = metadata::extract_funding(front);
49
50 let abstract_text = metadata::extract_abstract(front);
52 let copyright = metadata::extract_copyright(front);
53 let license = metadata::extract_license(front);
54 let license_url = metadata::extract_license_url(front);
55 let history_dates = metadata::extract_history_dates(front);
56 let categories = metadata::extract_categories(front);
57 let fpage = metadata::extract_fpage(front);
58 let lpage = metadata::extract_lpage(front);
59 let elocation_id = metadata::extract_elocation_id(front);
60
61 let article_type = metadata::extract_article_type(xml_content);
63
64 let conflict_of_interest = metadata::extract_conflict_of_interest(back);
66 let acknowledgments = metadata::extract_acknowledgments(back);
67
68 let data_availability = metadata::extract_data_availability(xml_content);
70 let supplementary_materials = metadata::extract_supplementary_materials(xml_content);
71
72 let authors = author::extract_authors(front)?;
74
75 let sections = section::extract_sections_enhanced(xml_content);
77
78 let references = reference::extract_references_detailed(xml_content).unwrap_or_default();
80
81 let license = if license.is_some() || license_url.is_some() {
83 Some(License {
84 href: license_url,
85 text: license,
86 })
87 } else {
88 None
89 };
90 let permissions = if copyright.is_some() || license.is_some() {
91 Some(Permissions {
92 copyright_statement: copyright,
93 license,
94 })
95 } else {
96 None
97 };
98
99 let abstracts = abstract_text
101 .map(|text| {
102 vec![Abstract {
103 abstract_type: None,
104 text,
105 sections: Vec::new(),
106 }]
107 })
108 .unwrap_or_default();
109
110 let front = Front {
111 journal_meta: journal,
112 article_meta: ArticleMeta {
113 pmcid: pmcid_typed,
114 pmid,
115 doi,
116 categories,
117 title_group: TitleGroup {
118 article_title: title,
119 subtitle,
120 },
121 authors,
122 pub_dates,
123 volume,
124 issue,
125 fpage,
126 lpage,
127 elocation_id,
128 history: history_dates,
129 permissions,
130 abstracts,
131 keywords,
132 funding,
133 },
134 };
135
136 let body = if xml_content.contains("<body") || !sections.is_empty() {
137 Some(Body { sections })
138 } else {
139 None
140 };
141
142 let has_back_content =
143 acknowledgments.is_some() || conflict_of_interest.is_some() || !references.is_empty();
144 let back = if !back.is_empty() || has_back_content {
145 Some(Back {
146 acknowledgments,
147 conflict_of_interest,
148 references,
149 appendices: Vec::new(),
150 glossary: Vec::new(),
151 })
152 } else {
153 None
154 };
155
156 Ok(PmcArticle {
157 article_type,
158 front,
159 body,
160 back,
161 supplementary_materials,
162 data_availability,
163 })
164}
165
// Unit tests for `parse_pmc_xml`, driven by inline XML fixtures.
166#[cfg(test)]
167mod tests {
168 use super::*;
169
    // Parses a fixture that has front matter (identifiers, a title, one contributor and a
    // publication date), a body with one section and a back section with one reference, then
    // checks that each of those pieces is reachable through the `PmcArticle` accessors.
170 #[test]
171 fn test_parse_basic_structure() {
172 let xml_content = r#"
174 <article xmlns:xlink="http://www.w3.org/1999/xlink" article-type="research-article">
175 <front>
176 <article-meta>
177 <article-id pub-id-type="pmc">PMC123456</article-id>
178 <article-id pub-id-type="doi">10.1234/test</article-id>
179 <title-group>
180 <article-title>Test Article Title</article-title>
181 </title-group>
182 <contrib-group>
183 <contrib>
184 <name>
185 <surname>Doe</surname>
186 <given-names>John</given-names>
187 </name>
188 </contrib>
189 </contrib-group>
190 <pub-date>
191 <year>2023</year>
192 <month>12</month>
193 <day>25</day>
194 </pub-date>
195 </article-meta>
196 </front>
197 <body>
198 <sec>
199 <title>Introduction</title>
200 <p>This is the introduction.</p>
201 </sec>
202 </body>
203 <back>
204 <ref-list>
205 <ref id="ref1">
206 <element-citation>
207 <article-title>Reference Title</article-title>
208 </element-citation>
209 </ref>
210 </ref-list>
211 </back>
212 </article>
213 "#;
214
215 let result = parse_pmc_xml(xml_content, "PMC123456");
216 assert!(result.is_ok());
217
218 let article = result.unwrap();
    // Identifier and title.
219 assert_eq!(article.pmcid().as_str(), "PMC123456");
220 assert_eq!(article.title(), "Test Article Title");
    // The single <pub-date> must surface with its year, month and day.
221 assert!(!article.pub_dates().is_empty());
222 assert_eq!(article.pub_dates()[0].year, Some(2023));
223 assert_eq!(article.pub_dates()[0].month, Some(12));
224 assert_eq!(article.pub_dates()[0].day, Some(25));
    // Contributor, body section and reference must each yield at least one entry.
225 assert!(!article.authors().is_empty());
226 assert!(!article.sections().is_empty());
227 assert!(!article.references().is_empty());
228 }
229
    // Parses a fixture whose front matter contains nothing but a title and that has neither a
    // <body> nor a <back> element; parsing must still succeed.
230 #[test]
231 fn test_parse_minimal_xml() {
232 let xml_content = r#"
234 <article>
235 <front>
236 <article-meta>
237 <title-group>
238 <article-title>Minimal Test</article-title>
239 </title-group>
240 </article-meta>
241 </front>
242 </article>
243 "#;
244
245 let result = parse_pmc_xml(xml_content, "PMC100000");
246 assert!(result.is_ok());
247
248 let article = result.unwrap();
    // The fixture carries no <article-id>; the PMCID asserted here is the one passed as the
    // second argument to `parse_pmc_xml`.
249 assert_eq!(article.pmcid().as_str(), "PMC100000");
250 assert_eq!(article.title(), "Minimal Test");
251 }
252
253 }