pubmed_parser/pmc/parser/
reference.rs

1use crate::common::Author;
2use crate::common::xml_utils::strip_inline_html_tags;
3use crate::error::Result;
4use crate::pmc::domain::Reference;
5use quick_xml::de::from_str;
6use serde::Deserialize;
7use tracing;
8
9/// XML structure for ref-list element
10#[derive(Debug, Deserialize)]
11#[serde(rename = "ref-list")]
12struct RefList {
13    #[serde(rename = "@id", default)]
14    #[allow(dead_code)]
15    id: Option<String>,
16
17    #[serde(rename = "title", default)]
18    #[allow(dead_code)]
19    title: Option<String>,
20
21    #[serde(rename = "ref", default)]
22    refs: Vec<Ref>,
23}
24
25/// XML structure for ref element
26#[derive(Debug, Deserialize)]
27struct Ref {
28    #[serde(rename = "@id")]
29    id: Option<String>,
30
31    #[serde(rename = "label", default)]
32    #[allow(dead_code)]
33    label: Option<String>,
34
35    #[serde(rename = "element-citation", default)]
36    element_citation: Option<ElementCitation>,
37
38    #[serde(rename = "mixed-citation", default)]
39    mixed_citation: Option<MixedCitation>,
40}
41
42/// XML structure for element-citation
43#[derive(Debug, Deserialize)]
44#[serde(rename = "element-citation")]
45struct ElementCitation {
46    #[serde(rename = "@publication-type")]
47    publication_type: Option<String>,
48
49    #[serde(rename = "@id", default)]
50    #[allow(dead_code)]
51    citation_id: Option<String>,
52
53    #[serde(rename = "article-title", default)]
54    article_title: Option<String>,
55
56    #[serde(rename = "source", default)]
57    source: Option<String>,
58
59    #[serde(rename = "year", default)]
60    year: Option<String>,
61
62    #[serde(rename = "volume", default)]
63    volume: Option<String>,
64
65    #[serde(rename = "issue", default)]
66    issue: Option<String>,
67
68    #[serde(rename = "fpage", default)]
69    fpage: Option<String>,
70
71    #[serde(rename = "lpage", default)]
72    lpage: Option<String>,
73
74    #[serde(rename = "pub-id", default)]
75    pub_ids: Vec<PubId>,
76
77    #[serde(rename = "person-group", default)]
78    person_groups: Vec<PersonGroup>,
79}
80
81/// XML structure for mixed-citation (alternative citation format)
82#[derive(Debug, Deserialize)]
83#[serde(rename = "mixed-citation")]
84struct MixedCitation {
85    #[serde(rename = "@publication-type")]
86    publication_type: Option<String>,
87
88    #[serde(rename = "@id", default)]
89    #[allow(dead_code)]
90    citation_id: Option<String>,
91
92    #[serde(rename = "article-title", default)]
93    article_title: Option<String>,
94
95    #[serde(rename = "source", default)]
96    source: Option<String>,
97
98    #[serde(rename = "year", default)]
99    year: Option<String>,
100
101    #[serde(rename = "volume", default)]
102    volume: Option<String>,
103
104    #[serde(rename = "issue", default)]
105    issue: Option<String>,
106
107    #[serde(rename = "fpage", default)]
108    fpage: Option<String>,
109
110    #[serde(rename = "lpage", default)]
111    lpage: Option<String>,
112
113    #[serde(rename = "pub-id", default)]
114    pub_ids: Vec<PubId>,
115
116    #[serde(rename = "person-group", default)]
117    person_groups: Vec<PersonGroup>,
118}
119
120/// XML structure for pub-id element
121#[derive(Debug, Deserialize)]
122struct PubId {
123    #[serde(rename = "@pub-id-type")]
124    pub_id_type: Option<String>,
125
126    #[serde(rename = "$text")]
127    value: Option<String>,
128}
129
130/// XML structure for person-group element
131#[derive(Debug, Deserialize)]
132#[serde(rename = "person-group")]
133struct PersonGroup {
134    #[serde(rename = "@person-group-type")]
135    person_group_type: Option<String>,
136
137    #[serde(rename = "name", default)]
138    names: Vec<Name>,
139
140    #[serde(rename = "etal", default)]
141    #[allow(dead_code)]
142    etal: Option<String>,
143
144    #[serde(rename = "collab", default)]
145    #[allow(dead_code)]
146    collab: Option<String>,
147}
148
149/// XML structure for name element
150#[derive(Debug, Deserialize)]
151struct Name {
152    #[serde(rename = "@name-style", default)]
153    #[allow(dead_code)]
154    name_style: Option<String>,
155
156    #[serde(rename = "surname", default)]
157    surname: Option<String>,
158
159    #[serde(rename = "given-names", default)]
160    given_names: Option<String>,
161
162    #[serde(rename = "suffix", default)]
163    #[allow(dead_code)]
164    suffix: Option<String>,
165}
166
167/// Strip `<comment>...</comment>` elements from XML content.
168///
169/// These elements cause "duplicate field" errors in quick-xml serde deserialization
170/// when multiple `<comment>` elements appear in the same citation.
171fn strip_comment_tags(content: &str) -> String {
172    use regex::Regex;
173    use std::sync::OnceLock;
174
175    static COMMENT_RE: OnceLock<Regex> = OnceLock::new();
176    let re =
177        COMMENT_RE.get_or_init(|| Regex::new(r"<comment[^>]*>.*?</comment>").expect("valid regex"));
178    re.replace_all(content, "").into_owned()
179}
180
181/// Extract detailed references from ref-list or alternative reference structures
182pub fn extract_references_detailed(content: &str) -> Result<Vec<Reference>> {
183    // Try multiple reference extraction strategies to handle different PMC XML formats
184
185    // Strategy 1: Standard <ref-list> structure
186    if let Some(references) = try_extract_from_ref_list(content)? {
187        tracing::debug!(
188            count = references.len(),
189            "Extracted references from ref-list"
190        );
191        return Ok(references);
192    }
193
194    // Strategy 2: Alternative <references> structure
195    if let Some(references) = try_extract_from_references_tag(content)? {
196        tracing::debug!(
197            count = references.len(),
198            "Extracted references from references tag"
199        );
200        return Ok(references);
201    }
202
203    // Strategy 3: Direct <ref> tags in <back> section
204    if let Some(references) = try_extract_from_back_section(content)? {
205        tracing::debug!(
206            count = references.len(),
207            "Extracted references from back section"
208        );
209        return Ok(references);
210    }
211
212    // No references found with any strategy
213    Ok(Vec::new())
214}
215
216/// Try to extract references from standard <ref-list> structure
217fn try_extract_from_ref_list(content: &str) -> Result<Option<Vec<Reference>>> {
218    let ref_list_content = if let Some(start) = content.find("<ref-list") {
219        if let Some(end) = content[start..].find("</ref-list>") {
220            &content[start..start + end + 11] // +11 for "</ref-list>"
221        } else {
222            return Ok(None);
223        }
224    } else {
225        return Ok(None);
226    };
227
228    // Parse the ref-list (strip inline HTML tags and comment tags first)
229    let cleaned_content = strip_inline_html_tags(ref_list_content);
230    let cleaned_content = strip_comment_tags(&cleaned_content);
231    match from_str::<RefList>(&cleaned_content) {
232        Ok(ref_list) => {
233            let references = ref_list
234                .refs
235                .into_iter()
236                .filter_map(parse_ref_to_reference)
237                .collect();
238            Ok(Some(references))
239        }
240        Err(e) => {
241            tracing::debug!("Failed to parse ref-list as whole: {}", e);
242            Ok(None)
243        }
244    }
245}
246
247/// Try to extract references from alternative <references> structure
248fn try_extract_from_references_tag(content: &str) -> Result<Option<Vec<Reference>>> {
249    // Some PMC articles use <references> instead of <ref-list>
250    let references_content = if let Some(start) = content.find("<references") {
251        if let Some(end) = content[start..].find("</references>") {
252            &content[start..start + end + 13] // +13 for "</references>"
253        } else {
254            return Ok(None);
255        }
256    } else {
257        return Ok(None);
258    };
259
260    // Try to adapt the content to ref-list format for parsing
261    let adapted_content = references_content
262        .replace("<references", "<ref-list")
263        .replace("</references>", "</ref-list>");
264
265    let cleaned_adapted = strip_inline_html_tags(&adapted_content);
266    let cleaned_adapted = strip_comment_tags(&cleaned_adapted);
267    match from_str::<RefList>(&cleaned_adapted) {
268        Ok(ref_list) => {
269            let references = ref_list
270                .refs
271                .into_iter()
272                .filter_map(parse_ref_to_reference)
273                .collect();
274            Ok(Some(references))
275        }
276        Err(_) => Ok(None),
277    }
278}
279
280/// Try to extract references from direct <ref> tags in <back> section
281fn try_extract_from_back_section(content: &str) -> Result<Option<Vec<Reference>>> {
282    // Extract the back section
283    let back_content = if let Some(start) = content.find("<back>") {
284        if let Some(end) = content[start..].find("</back>") {
285            &content[start..start + end + 7] // +7 for "</back>"
286        } else {
287            return Ok(None);
288        }
289    } else {
290        return Ok(None);
291    };
292
293    // Look for <ref> tags directly in the back section
294    let mut references = Vec::new();
295    let mut pos = 0;
296
297    while let Some(ref_start) = back_content[pos..].find("<ref ") {
298        let ref_start = pos + ref_start;
299        if let Some(ref_end) = back_content[ref_start..].find("</ref>") {
300            let ref_end = ref_start + ref_end + 6; // +6 for "</ref>"
301            let ref_content = &back_content[ref_start..ref_end];
302
303            // Wrap the ref in a temporary ref-list structure to reuse existing parsing
304            let wrapped_content = format!("<ref-list>{}</ref-list>", ref_content);
305            let cleaned_wrapped = strip_inline_html_tags(&wrapped_content);
306            let cleaned_wrapped = strip_comment_tags(&cleaned_wrapped);
307
308            if let Ok(ref_list) = from_str::<RefList>(&cleaned_wrapped) {
309                for ref_item in ref_list.refs {
310                    if let Some(reference) = parse_ref_to_reference(ref_item) {
311                        references.push(reference);
312                    }
313                }
314            }
315
316            pos = ref_end;
317        } else {
318            break;
319        }
320    }
321
322    if references.is_empty() {
323        Ok(None)
324    } else {
325        Ok(Some(references))
326    }
327}
328
329/// Convert a Ref struct to a domain Reference
330fn parse_ref_to_reference(ref_elem: Ref) -> Option<Reference> {
331    let id = ref_elem.id.unwrap_or_else(|| String::from("unknown"));
332
333    // Try element-citation first, then mixed-citation
334    let citation = ref_elem
335        .element_citation
336        .map(Citation::Element)
337        .or_else(|| ref_elem.mixed_citation.map(Citation::Mixed));
338
339    let citation = citation?;
340
341    let (
342        publication_type,
343        title,
344        source,
345        year,
346        volume,
347        issue,
348        fpage,
349        lpage,
350        pub_ids,
351        person_groups,
352    ) = match citation {
353        Citation::Element(elem) => (
354            elem.publication_type,
355            elem.article_title,
356            elem.source,
357            elem.year,
358            elem.volume,
359            elem.issue,
360            elem.fpage,
361            elem.lpage,
362            elem.pub_ids,
363            elem.person_groups,
364        ),
365        Citation::Mixed(mixed) => (
366            mixed.publication_type,
367            mixed.article_title,
368            mixed.source,
369            mixed.year,
370            mixed.volume,
371            mixed.issue,
372            mixed.fpage,
373            mixed.lpage,
374            mixed.pub_ids,
375            mixed.person_groups,
376        ),
377    };
378
379    let mut doi = None;
380    let mut pmid = None;
381    for pub_id in pub_ids {
382        if let (Some(id_type), Some(value)) = (pub_id.pub_id_type, pub_id.value) {
383            match id_type.as_str() {
384                "doi" => doi = Some(value),
385                "pmid" => pmid = Some(value),
386                _ => {}
387            }
388        }
389    }
390
391    Some(Reference {
392        id,
393        publication_type,
394        title,
395        authors: extract_authors_from_person_groups(person_groups),
396        source,
397        year,
398        volume,
399        issue,
400        pages: format_pages(fpage, lpage),
401        pmid,
402        doi,
403    })
404}
405
406/// Helper enum to handle both citation types uniformly
407enum Citation {
408    Element(ElementCitation),
409    Mixed(MixedCitation),
410}
411
/// Build a page string from a first and a last page.
///
/// - both present: `"<first>-<last>"`
/// - only the first page: that page on its own
/// - no first page: `None`, even if a last page was supplied
fn format_pages(fpage: Option<String>, lpage: Option<String>) -> Option<String> {
    // Without a first page there is nothing to report.
    let first = fpage?;
    let pages = match lpage {
        Some(last) => format!("{first}-{last}"),
        None => first,
    };
    Some(pages)
}
420
421/// Extract authors from person groups
422fn extract_authors_from_person_groups(person_groups: Vec<PersonGroup>) -> Vec<Author> {
423    let mut authors = Vec::new();
424
425    for group in person_groups {
426        // Only process author groups (not editor, etc.)
427        if group.person_group_type.as_deref() == Some("author") || group.person_group_type.is_none()
428        {
429            for name in group.names {
430                let author = Author::new(name.surname.clone(), name.given_names.clone());
431                authors.push(author);
432            }
433        }
434    }
435
436    authors
437}
438
#[cfg(test)]
mod tests {
    use super::*;

    /// A well-formed `<ref-list>` with one journal citation yields one fully
    /// populated reference.
    #[test]
    fn test_extract_references_detailed() {
        let content = r#"
        <ref-list>
            <ref id="ref1">
                <element-citation publication-type="journal">
                    <person-group person-group-type="author">
                        <name>
                            <surname>Smith</surname>
                            <given-names>J</given-names>
                        </name>
                    </person-group>
                    <article-title>Test Article</article-title>
                    <source>Test Journal</source>
                    <year>2023</year>
                    <volume>10</volume>
                    <issue>2</issue>
                    <fpage>123</fpage>
                    <lpage>130</lpage>
                    <pub-id pub-id-type="doi">10.1234/test</pub-id>
                </element-citation>
            </ref>
        </ref-list>
        "#;

        let references = extract_references_detailed(content).unwrap();
        assert_eq!(references.len(), 1);

        let ref1 = &references[0];
        assert_eq!(ref1.id, "ref1");
        assert_eq!(ref1.title.as_deref(), Some("Test Article"));
        assert_eq!(ref1.source.as_deref(), Some("Test Journal"));
        assert_eq!(ref1.year.as_deref(), Some("2023"));
        assert_eq!(ref1.volume.as_deref(), Some("10"));
        assert_eq!(ref1.issue.as_deref(), Some("2"));
        assert_eq!(ref1.pages.as_deref(), Some("123-130"));
        assert_eq!(ref1.doi.as_deref(), Some("10.1234/test"));
        assert_eq!(ref1.authors.len(), 1);
    }

    /// Content without any reference structure produces an empty result, not an error.
    #[test]
    fn test_extract_references_no_ref_list() {
        let content = "<article>No references here</article>";
        let references = extract_references_detailed(content).unwrap();
        assert!(references.is_empty());
    }

    /// Extraction is best-effort: malformed XML is expected to give an empty
    /// vector rather than an error.
    #[test]
    fn test_extract_references_invalid_xml() {
        let content = "<ref-list><ref>Invalid XML</ref-list>";
        let references = extract_references_detailed(content)
            .expect("malformed XML must not surface as an error");
        assert!(references.is_empty());
    }

    /// The serde structs must cope with the extra elements seen in real PMC XML:
    /// `<comment>` interleaved with `<pub-id>`, `<etal/>`, labels and titles.
    #[test]
    fn test_extract_references_with_comments_and_etal() {
        let content = r#"<ref-list id="bibl10"><title>References</title>
<ref id="bib3"><label>3</label><element-citation publication-type="journal" id="sbref30"><person-group person-group-type="author"><name name-style="western"><surname>Alvarez</surname><given-names>C</given-names></name><etal/></person-group><article-title>Test Article</article-title><source>MedRxiv</source><year>2021</year><comment>published online 20.</comment><pub-id pub-id-type="doi">10.1234/test</pub-id><comment>(preprint)</comment><pub-id pub-id-type="pmcid">PMC123</pub-id><pub-id pub-id-type="pmid">123</pub-id></element-citation></ref>
</ref-list>"#;

        let references = extract_references_detailed(content).unwrap();
        assert_eq!(
            references.len(),
            1,
            "Should parse ref with comments and etal"
        );

        let ref3 = &references[0];
        assert_eq!(ref3.id, "bib3");
        assert_eq!(ref3.title.as_deref(), Some("Test Article"));
        assert_eq!(ref3.source.as_deref(), Some("MedRxiv"));
        assert_eq!(ref3.authors.len(), 1);
        assert_eq!(ref3.authors[0].surname.as_deref(), Some("Alvarez"));
    }
}