1use crate::common::Author;
2use crate::common::xml_utils::strip_inline_html_tags;
3use crate::error::Result;
4use crate::pmc::domain::Reference;
5use quick_xml::de::from_str;
6use serde::Deserialize;
7use tracing;
8
/// Serde model of a `<ref-list>` element.
///
/// In the `rename` attributes below, names prefixed with `@` bind XML
/// attributes and unprefixed names bind child elements.
#[derive(Debug, Deserialize)]
#[serde(rename = "ref-list")]
struct RefList {
    /// Value of the `id` attribute; `None` when the attribute is absent.
    #[serde(rename = "@id", default)]
    // Captured while parsing but not read by the extraction code in this file.
    #[allow(dead_code)]
    id: Option<String>,

    /// Text of the `<title>` child element; `None` when absent.
    #[serde(rename = "title", default)]
    // Captured while parsing but not read by the extraction code in this file.
    #[allow(dead_code)]
    title: Option<String>,

    /// Every `<ref>` child element, in document order; empty when there are none.
    #[serde(rename = "ref", default)]
    refs: Vec<Ref>,
}
24
/// Serde model of a single `<ref>` entry inside a reference list.
///
/// A `<ref>` may carry its citation as an `<element-citation>`, a
/// `<mixed-citation>`, both, or neither; `parse_ref_to_reference` decides
/// which one is used.
#[derive(Debug, Deserialize)]
struct Ref {
    /// Value of the `id` attribute, if any.
    #[serde(rename = "@id")]
    id: Option<String>,

    /// Text of the `<label>` child element; `None` when absent.
    #[serde(rename = "label", default)]
    // Captured while parsing but not read by the extraction code in this file.
    #[allow(dead_code)]
    label: Option<String>,

    /// The `<element-citation>` child, when present.
    #[serde(rename = "element-citation", default)]
    element_citation: Option<ElementCitation>,

    /// The `<mixed-citation>` child, when present.
    #[serde(rename = "mixed-citation", default)]
    mixed_citation: Option<MixedCitation>,
}
41
/// Serde model of an `<element-citation>` element.
///
/// Has the same set of fields as `MixedCitation`; the two differ only in the
/// XML element name they are bound to.
#[derive(Debug, Deserialize)]
#[serde(rename = "element-citation")]
struct ElementCitation {
    /// Value of the `publication-type` attribute (e.g. `journal` in the tests).
    #[serde(rename = "@publication-type")]
    publication_type: Option<String>,

    /// Value of the citation's own `id` attribute; `None` when absent.
    #[serde(rename = "@id", default)]
    // Captured while parsing but not read by the extraction code in this file.
    #[allow(dead_code)]
    citation_id: Option<String>,

    /// Text of the `<article-title>` child element.
    #[serde(rename = "article-title", default)]
    article_title: Option<String>,

    /// Text of the `<source>` child element.
    #[serde(rename = "source", default)]
    source: Option<String>,

    /// Text of the `<year>` child element, kept as a string.
    #[serde(rename = "year", default)]
    year: Option<String>,

    /// Text of the `<volume>` child element.
    #[serde(rename = "volume", default)]
    volume: Option<String>,

    /// Text of the `<issue>` child element.
    #[serde(rename = "issue", default)]
    issue: Option<String>,

    /// Text of the `<fpage>` child element (combined with `lpage` by `format_pages`).
    #[serde(rename = "fpage", default)]
    fpage: Option<String>,

    /// Text of the `<lpage>` child element (combined with `fpage` by `format_pages`).
    #[serde(rename = "lpage", default)]
    lpage: Option<String>,

    /// Every `<pub-id>` child element; empty when there are none.
    #[serde(rename = "pub-id", default)]
    pub_ids: Vec<PubId>,

    /// Every `<person-group>` child element; empty when there are none.
    #[serde(rename = "person-group", default)]
    person_groups: Vec<PersonGroup>,
}
80
/// Serde model of a `<mixed-citation>` element.
///
/// Has the same set of fields as `ElementCitation`; the two differ only in the
/// XML element name they are bound to.
#[derive(Debug, Deserialize)]
#[serde(rename = "mixed-citation")]
struct MixedCitation {
    /// Value of the `publication-type` attribute.
    #[serde(rename = "@publication-type")]
    publication_type: Option<String>,

    /// Value of the citation's own `id` attribute; `None` when absent.
    #[serde(rename = "@id", default)]
    // Captured while parsing but not read by the extraction code in this file.
    #[allow(dead_code)]
    citation_id: Option<String>,

    /// Text of the `<article-title>` child element.
    #[serde(rename = "article-title", default)]
    article_title: Option<String>,

    /// Text of the `<source>` child element.
    #[serde(rename = "source", default)]
    source: Option<String>,

    /// Text of the `<year>` child element, kept as a string.
    #[serde(rename = "year", default)]
    year: Option<String>,

    /// Text of the `<volume>` child element.
    #[serde(rename = "volume", default)]
    volume: Option<String>,

    /// Text of the `<issue>` child element.
    #[serde(rename = "issue", default)]
    issue: Option<String>,

    /// Text of the `<fpage>` child element (combined with `lpage` by `format_pages`).
    #[serde(rename = "fpage", default)]
    fpage: Option<String>,

    /// Text of the `<lpage>` child element (combined with `fpage` by `format_pages`).
    #[serde(rename = "lpage", default)]
    lpage: Option<String>,

    /// Every `<pub-id>` child element; empty when there are none.
    #[serde(rename = "pub-id", default)]
    pub_ids: Vec<PubId>,

    /// Every `<person-group>` child element; empty when there are none.
    #[serde(rename = "person-group", default)]
    person_groups: Vec<PersonGroup>,
}
119
/// Serde model of a `<pub-id>` element: a typed external identifier.
#[derive(Debug, Deserialize)]
struct PubId {
    /// Value of the `pub-id-type` attribute; `parse_ref_to_reference` acts on
    /// `doi` and `pmid` and ignores every other type.
    #[serde(rename = "@pub-id-type")]
    pub_id_type: Option<String>,

    /// The element's text content (bound through the `$text` name), i.e. the
    /// identifier itself.
    #[serde(rename = "$text")]
    value: Option<String>,
}
129
/// Serde model of a `<person-group>` element: a list of contributor names.
#[derive(Debug, Deserialize)]
#[serde(rename = "person-group")]
struct PersonGroup {
    /// Value of the `person-group-type` attribute. Author extraction keeps
    /// groups typed `author` and groups with no type at all.
    #[serde(rename = "@person-group-type")]
    person_group_type: Option<String>,

    /// Every `<name>` child element; empty when there are none.
    #[serde(rename = "name", default)]
    names: Vec<Name>,

    /// The `<etal>` child element, when present.
    #[serde(rename = "etal", default)]
    // Declared so the element is modelled; the value is not read in this file.
    #[allow(dead_code)]
    etal: Option<String>,

    /// Text of the `<collab>` child element, when present.
    #[serde(rename = "collab", default)]
    // Declared so the element is modelled; the value is not read in this file.
    #[allow(dead_code)]
    collab: Option<String>,
}
148
/// Serde model of a `<name>` element inside a `<person-group>`.
#[derive(Debug, Deserialize)]
struct Name {
    /// Value of the `name-style` attribute (e.g. `western` in the tests).
    #[serde(rename = "@name-style", default)]
    // Captured while parsing but not read by the extraction code in this file.
    #[allow(dead_code)]
    name_style: Option<String>,

    /// Text of the `<surname>` child element.
    #[serde(rename = "surname", default)]
    surname: Option<String>,

    /// Text of the `<given-names>` child element.
    #[serde(rename = "given-names", default)]
    given_names: Option<String>,

    /// Text of the `<suffix>` child element.
    #[serde(rename = "suffix", default)]
    // Captured while parsing but not read by the extraction code in this file.
    #[allow(dead_code)]
    suffix: Option<String>,
}
166
167fn strip_comment_tags(content: &str) -> String {
172 use regex::Regex;
173 use std::sync::OnceLock;
174
175 static COMMENT_RE: OnceLock<Regex> = OnceLock::new();
176 let re =
177 COMMENT_RE.get_or_init(|| Regex::new(r"<comment[^>]*>.*?</comment>").expect("valid regex"));
178 re.replace_all(content, "").into_owned()
179}
180
181pub fn extract_references_detailed(content: &str) -> Result<Vec<Reference>> {
183 if let Some(references) = try_extract_from_ref_list(content)? {
187 tracing::debug!(
188 count = references.len(),
189 "Extracted references from ref-list"
190 );
191 return Ok(references);
192 }
193
194 if let Some(references) = try_extract_from_references_tag(content)? {
196 tracing::debug!(
197 count = references.len(),
198 "Extracted references from references tag"
199 );
200 return Ok(references);
201 }
202
203 if let Some(references) = try_extract_from_back_section(content)? {
205 tracing::debug!(
206 count = references.len(),
207 "Extracted references from back section"
208 );
209 return Ok(references);
210 }
211
212 Ok(Vec::new())
214}
215
216fn try_extract_from_ref_list(content: &str) -> Result<Option<Vec<Reference>>> {
218 let ref_list_content = if let Some(start) = content.find("<ref-list") {
219 if let Some(end) = content[start..].find("</ref-list>") {
220 &content[start..start + end + 11] } else {
222 return Ok(None);
223 }
224 } else {
225 return Ok(None);
226 };
227
228 let cleaned_content = strip_inline_html_tags(ref_list_content);
230 let cleaned_content = strip_comment_tags(&cleaned_content);
231 match from_str::<RefList>(&cleaned_content) {
232 Ok(ref_list) => {
233 let references = ref_list
234 .refs
235 .into_iter()
236 .filter_map(parse_ref_to_reference)
237 .collect();
238 Ok(Some(references))
239 }
240 Err(e) => {
241 tracing::debug!("Failed to parse ref-list as whole: {}", e);
242 Ok(None)
243 }
244 }
245}
246
247fn try_extract_from_references_tag(content: &str) -> Result<Option<Vec<Reference>>> {
249 let references_content = if let Some(start) = content.find("<references") {
251 if let Some(end) = content[start..].find("</references>") {
252 &content[start..start + end + 13] } else {
254 return Ok(None);
255 }
256 } else {
257 return Ok(None);
258 };
259
260 let adapted_content = references_content
262 .replace("<references", "<ref-list")
263 .replace("</references>", "</ref-list>");
264
265 let cleaned_adapted = strip_inline_html_tags(&adapted_content);
266 let cleaned_adapted = strip_comment_tags(&cleaned_adapted);
267 match from_str::<RefList>(&cleaned_adapted) {
268 Ok(ref_list) => {
269 let references = ref_list
270 .refs
271 .into_iter()
272 .filter_map(parse_ref_to_reference)
273 .collect();
274 Ok(Some(references))
275 }
276 Err(_) => Ok(None),
277 }
278}
279
280fn try_extract_from_back_section(content: &str) -> Result<Option<Vec<Reference>>> {
282 let back_content = if let Some(start) = content.find("<back>") {
284 if let Some(end) = content[start..].find("</back>") {
285 &content[start..start + end + 7] } else {
287 return Ok(None);
288 }
289 } else {
290 return Ok(None);
291 };
292
293 let mut references = Vec::new();
295 let mut pos = 0;
296
297 while let Some(ref_start) = back_content[pos..].find("<ref ") {
298 let ref_start = pos + ref_start;
299 if let Some(ref_end) = back_content[ref_start..].find("</ref>") {
300 let ref_end = ref_start + ref_end + 6; let ref_content = &back_content[ref_start..ref_end];
302
303 let wrapped_content = format!("<ref-list>{}</ref-list>", ref_content);
305 let cleaned_wrapped = strip_inline_html_tags(&wrapped_content);
306 let cleaned_wrapped = strip_comment_tags(&cleaned_wrapped);
307
308 if let Ok(ref_list) = from_str::<RefList>(&cleaned_wrapped) {
309 for ref_item in ref_list.refs {
310 if let Some(reference) = parse_ref_to_reference(ref_item) {
311 references.push(reference);
312 }
313 }
314 }
315
316 pos = ref_end;
317 } else {
318 break;
319 }
320 }
321
322 if references.is_empty() {
323 Ok(None)
324 } else {
325 Ok(Some(references))
326 }
327}
328
329fn parse_ref_to_reference(ref_elem: Ref) -> Option<Reference> {
331 let id = ref_elem.id.unwrap_or_else(|| String::from("unknown"));
332
333 let citation = ref_elem
335 .element_citation
336 .map(Citation::Element)
337 .or_else(|| ref_elem.mixed_citation.map(Citation::Mixed));
338
339 let citation = citation?;
340
341 let (
342 publication_type,
343 title,
344 source,
345 year,
346 volume,
347 issue,
348 fpage,
349 lpage,
350 pub_ids,
351 person_groups,
352 ) = match citation {
353 Citation::Element(elem) => (
354 elem.publication_type,
355 elem.article_title,
356 elem.source,
357 elem.year,
358 elem.volume,
359 elem.issue,
360 elem.fpage,
361 elem.lpage,
362 elem.pub_ids,
363 elem.person_groups,
364 ),
365 Citation::Mixed(mixed) => (
366 mixed.publication_type,
367 mixed.article_title,
368 mixed.source,
369 mixed.year,
370 mixed.volume,
371 mixed.issue,
372 mixed.fpage,
373 mixed.lpage,
374 mixed.pub_ids,
375 mixed.person_groups,
376 ),
377 };
378
379 let mut doi = None;
380 let mut pmid = None;
381 for pub_id in pub_ids {
382 if let (Some(id_type), Some(value)) = (pub_id.pub_id_type, pub_id.value) {
383 match id_type.as_str() {
384 "doi" => doi = Some(value),
385 "pmid" => pmid = Some(value),
386 _ => {}
387 }
388 }
389 }
390
391 Some(Reference {
392 id,
393 publication_type,
394 title,
395 authors: extract_authors_from_person_groups(person_groups),
396 source,
397 year,
398 volume,
399 issue,
400 pages: format_pages(fpage, lpage),
401 pmid,
402 doi,
403 })
404}
405
/// The citation payload chosen for a `<ref>` entry.
///
/// Lets `parse_ref_to_reference` pick one of the two citation models first and
/// then read the fields they have in common.
enum Citation {
    /// The entry's `<element-citation>`.
    Element(ElementCitation),
    /// The entry's `<mixed-citation>`.
    Mixed(MixedCitation),
}
411
/// Builds the page string of a reference from its first and last page.
///
/// * first and last page present: `"first-last"`
/// * only the first page present: the first page as is
/// * no first page: `None`, even when a last page is given
fn format_pages(fpage: Option<String>, lpage: Option<String>) -> Option<String> {
    // Without a first page there is nothing to report.
    let first = fpage?;

    let pages = if let Some(last) = lpage {
        format!("{}-{}", first, last)
    } else {
        first
    };
    Some(pages)
}
420
421fn extract_authors_from_person_groups(person_groups: Vec<PersonGroup>) -> Vec<Author> {
423 let mut authors = Vec::new();
424
425 for group in person_groups {
426 if group.person_group_type.as_deref() == Some("author") || group.person_group_type.is_none()
428 {
429 for name in group.names {
430 let author = Author::new(name.surname.clone(), name.given_names.clone());
431 authors.push(author);
432 }
433 }
434 }
435
436 authors
437}
438
/// Unit tests for reference extraction through `extract_references_detailed`.
#[cfg(test)]
mod tests {
    use super::*;

    /// A well-formed `<ref-list>` with one journal citation is parsed and all
    /// of its bibliographic fields are carried over.
    #[test]
    fn test_extract_references_detailed() {
        let content = r#"
 <ref-list>
 <ref id="ref1">
 <element-citation publication-type="journal">
 <person-group person-group-type="author">
 <name>
 <surname>Smith</surname>
 <given-names>J</given-names>
 </name>
 </person-group>
 <article-title>Test Article</article-title>
 <source>Test Journal</source>
 <year>2023</year>
 <volume>10</volume>
 <issue>2</issue>
 <fpage>123</fpage>
 <lpage>130</lpage>
 <pub-id pub-id-type="doi">10.1234/test</pub-id>
 </element-citation>
 </ref>
 </ref-list>
 "#;

        let references = extract_references_detailed(content).unwrap();
        assert_eq!(references.len(), 1);

        let ref1 = &references[0];
        assert_eq!(ref1.id, "ref1");
        assert_eq!(ref1.title, Some("Test Article".to_string()));
        assert_eq!(ref1.source, Some("Test Journal".to_string()));
        assert_eq!(ref1.year, Some("2023".to_string()));
        assert_eq!(ref1.volume, Some("10".to_string()));
        assert_eq!(ref1.issue, Some("2".to_string()));
        // First and last page are joined with a hyphen.
        assert_eq!(ref1.pages, Some("123-130".to_string()));
        assert_eq!(ref1.doi, Some("10.1234/test".to_string()));
        assert_eq!(ref1.authors.len(), 1);
    }

    /// Content without any reference markup yields an empty list, not an error.
    #[test]
    fn test_extract_references_no_ref_list() {
        let content = "<article>No references here</article>";
        let references = extract_references_detailed(content).unwrap();
        assert_eq!(references.len(), 0);
    }

    /// Malformed XML (an unclosed `<ref>`) is tolerated: the call still
    /// succeeds and simply returns no references.
    #[test]
    fn test_extract_references_invalid_xml() {
        let content = "<ref-list><ref>Invalid XML</ref-list>";
        let result = extract_references_detailed(content);
        assert!(result.is_ok());
        assert_eq!(result.unwrap().len(), 0);
    }

    /// A citation containing `<comment>` elements between its fields and an
    /// `<etal/>` marker in the person group is still parsed.
    #[test]
    fn test_extract_references_with_comments_and_etal() {
        let content = r#"<ref-list id="bibl10"><title>References</title>
<ref id="bib3"><label>3</label><element-citation publication-type="journal" id="sbref30"><person-group person-group-type="author"><name name-style="western"><surname>Alvarez</surname><given-names>C</given-names></name><etal/></person-group><article-title>Test Article</article-title><source>MedRxiv</source><year>2021</year><comment>published online 20.</comment><pub-id pub-id-type="doi">10.1234/test</pub-id><comment>(preprint)</comment><pub-id pub-id-type="pmcid">PMC123</pub-id><pub-id pub-id-type="pmid">123</pub-id></element-citation></ref>
</ref-list>"#;

        let references = extract_references_detailed(content).unwrap();
        assert_eq!(
            references.len(),
            1,
            "Should parse ref with comments and etal"
        );

        let ref3 = &references[0];
        assert_eq!(ref3.id, "bib3");
        assert_eq!(ref3.title, Some("Test Article".to_string()));
        assert_eq!(ref3.source, Some("MedRxiv".to_string()));
        assert_eq!(ref3.authors.len(), 1);
        assert_eq!(ref3.authors[0].surname, Some("Alvarez".to_string()));
    }
}