pubmed_parser/pmc/parser/
author.rs

1use crate::common::xml_utils::strip_inline_html_tags;
2use crate::common::{Affiliation, Author};
3use crate::error::{ParseError, Result};
4use quick_xml::de::from_str;
5use serde::Deserialize;
6
7/// XML structure for contrib-group element
8#[derive(Debug, Deserialize)]
9#[serde(rename = "contrib-group")]
10struct ContribGroup {
11    #[serde(rename = "contrib", default)]
12    contribs: Vec<Contrib>,
13}
14
15/// XML structure for contrib element
16#[derive(Debug, Deserialize)]
17struct Contrib {
18    #[serde(rename = "@corresp", default)]
19    corresp: Option<String>,
20
21    #[serde(rename = "contrib-id", default)]
22    contrib_ids: Vec<ContribId>,
23
24    #[serde(rename = "name", default)]
25    name: Option<Name>,
26
27    #[serde(rename = "email", default)]
28    email: Option<String>,
29
30    #[serde(rename = "role", default)]
31    roles: Vec<String>,
32
33    #[serde(rename = "xref", default)]
34    xrefs: Vec<Xref>,
35
36    #[serde(rename = "aff", default)]
37    affs: Vec<Aff>,
38}
39
40/// XML structure for contrib-id element
41#[derive(Debug, Deserialize)]
42struct ContribId {
43    #[serde(rename = "@contrib-id-type")]
44    contrib_id_type: Option<String>,
45
46    #[serde(rename = "$text")]
47    value: Option<String>,
48}
49
50/// XML structure for name element
51#[derive(Debug, Deserialize)]
52struct Name {
53    #[serde(rename = "@name-style", default)]
54    #[allow(dead_code)]
55    name_style: Option<String>,
56
57    #[serde(rename = "surname", default)]
58    surname: Option<String>,
59
60    #[serde(rename = "given-names", default)]
61    given_names: Option<String>,
62
63    #[serde(rename = "suffix", default)]
64    suffix: Option<String>,
65}
66
67/// XML structure for xref element
68#[derive(Debug, Deserialize)]
69struct Xref {
70    #[serde(rename = "@ref-type")]
71    ref_type: Option<String>,
72
73    #[serde(rename = "@rid")]
74    rid: Option<String>,
75}
76
77/// XML structure for aff element
78#[derive(Debug, Deserialize)]
79struct Aff {
80    #[serde(rename = "@id")]
81    id: Option<String>,
82
83    #[serde(rename = "$text", default)]
84    text: Option<String>,
85
86    #[serde(rename = "institution", default)]
87    #[allow(dead_code)]
88    institutions: Vec<String>,
89
90    #[serde(rename = "addr-line", default)]
91    #[allow(dead_code)]
92    addr_lines: Vec<String>,
93
94    #[serde(rename = "country", default)]
95    #[allow(dead_code)]
96    countries: Vec<String>,
97}
98
99/// XML structure for element-citation or mixed-citation
100#[derive(Debug, Deserialize)]
101#[serde(rename_all = "kebab-case")]
102struct Citation {
103    #[serde(rename = "person-group", default)]
104    person_groups: Vec<PersonGroup>,
105    #[serde(rename = "name", default)]
106    names: Vec<Name>,
107}
108
109/// XML structure for person-group element
110#[derive(Debug, Deserialize)]
111#[serde(rename = "person-group")]
112struct PersonGroup {
113    #[serde(rename = "@person-group-type")]
114    _group_type: Option<String>,
115    #[serde(rename = "name", default)]
116    names: Vec<Name>,
117}
118
119/// Extract authors from PMC XML content
120pub fn extract_authors(content: &str) -> Result<Vec<Author>> {
121    // Find and extract the contrib-group section
122    if let Some(contrib_start) = content.find("<contrib-group>") {
123        if let Some(contrib_end) = content[contrib_start..].find("</contrib-group>") {
124            let contrib_section =
125                &content[contrib_start..contrib_start + contrib_end + "</contrib-group>".len()];
126
127            // Try to deserialize the contrib-group (strip inline HTML tags first)
128            let cleaned_section = strip_inline_html_tags(contrib_section);
129            match from_str::<ContribGroup>(&cleaned_section) {
130                Ok(contrib_group) => {
131                    let authors = contrib_group
132                        .contribs
133                        .into_iter()
134                        .filter_map(parse_contrib_to_author)
135                        .collect();
136                    Ok(authors)
137                }
138                Err(e) => {
139                    // Log the error but continue with empty authors rather than failing completely
140                    tracing::warn!(
141                        "Failed to parse contrib-group XML ({}), continuing with empty authors",
142                        e
143                    );
144                    Ok(Vec::new())
145                }
146            }
147        } else {
148            Err(ParseError::XmlError(
149                "Found contrib-group start tag but no matching end tag".to_string(),
150            ))
151        }
152    } else {
153        // No contrib-group found - return empty vector as success
154        Ok(Vec::new())
155    }
156}
157
158/// Convert a Contrib to an Author
159fn parse_contrib_to_author(contrib: Contrib) -> Option<Author> {
160    let name = contrib.name?;
161
162    let mut author = Author::new(name.surname.clone(), name.given_names.clone());
163    author.suffix = name.suffix;
164
165    // Extract ORCID from contrib-id tags
166    for contrib_id in &contrib.contrib_ids {
167        if let Some(id_type) = &contrib_id.contrib_id_type
168            && id_type == "orcid"
169            && let Some(value) = &contrib_id.value
170        {
171            let clean_orcid = value.trim();
172            if clean_orcid.contains("orcid.org") || !clean_orcid.is_empty() {
173                author.orcid = Some(clean_orcid.to_string());
174                break;
175            }
176        }
177    }
178
179    // Set email
180    author.email = contrib.email.map(|e| e.trim().to_string());
181
182    // Set corresponding author flag (check both corresp="yes" attribute and <xref ref-type="corresp">)
183    author.is_corresponding = contrib.corresp.map(|c| c == "yes").unwrap_or(false)
184        || contrib
185            .xrefs
186            .iter()
187            .any(|x| x.ref_type.as_deref() == Some("corresp"));
188
189    // Set roles
190    author.roles = contrib
191        .roles
192        .into_iter()
193        .map(|r| r.trim().to_string())
194        .filter(|r| !r.is_empty())
195        .collect();
196
197    // Extract affiliations from xrefs
198    let mut affiliations = Vec::new();
199
200    // Process xref affiliations
201    for xref in &contrib.xrefs {
202        if let Some(ref_type) = &xref.ref_type
203            && ref_type == "aff"
204            && let Some(rid) = &xref.rid
205        {
206            affiliations.push(Affiliation {
207                id: Some(rid.clone()),
208                institution: Some(rid.clone()), // Use rid as institution for now
209                department: None,
210                address: None,
211                country: None,
212            });
213        }
214    }
215
216    // Process direct affiliations
217    for aff in &contrib.affs {
218        if let Some(text) = &aff.text {
219            let clean_text = text.trim();
220            if !clean_text.is_empty() {
221                affiliations.push(Affiliation {
222                    id: aff.id.clone(),
223                    institution: Some(clean_text.to_string()),
224                    department: None,
225                    address: None,
226                    country: None,
227                });
228            }
229        }
230    }
231
232    author.affiliations = affiliations;
233
234    Some(author)
235}
236
237/// Extract authors from reference sections
238pub fn extract_reference_authors(ref_content: &str) -> Result<Vec<Author>> {
239    let mut authors = Vec::new();
240
241    // Try to parse as element-citation
242    if ref_content.contains("<element-citation") {
243        if let Some(start) = ref_content.find("<element-citation")
244            && let Some(end) = ref_content[start..].find("</element-citation>")
245        {
246            let citation_content = &ref_content[start..start + end + "</element-citation>".len()];
247            let cleaned_citation = strip_inline_html_tags(citation_content);
248            match from_str::<Citation>(&cleaned_citation) {
249                Ok(citation) => {
250                    // Extract names from person-groups first
251                    for person_group in citation.person_groups {
252                        for name in person_group.names {
253                            authors.push(Author::new(name.surname, name.given_names));
254                        }
255                    }
256                    // Also check for direct names (without person-group wrapper)
257                    for name in citation.names {
258                        authors.push(Author::new(name.surname, name.given_names));
259                    }
260                    if !authors.is_empty() {
261                        return Ok(authors);
262                    }
263                }
264                Err(e) => {
265                    return Err(ParseError::XmlError(format!(
266                        "Failed to parse element-citation XML: {}",
267                        e
268                    )));
269                }
270            }
271        } else {
272            return Err(ParseError::XmlError(
273                "Found element-citation start tag but no matching end tag".to_string(),
274            ));
275        }
276    }
277
278    // Try to parse as mixed-citation
279    if ref_content.contains("<mixed-citation") {
280        if let Some(start) = ref_content.find("<mixed-citation")
281            && let Some(end) = ref_content[start..].find("</mixed-citation>")
282        {
283            let citation_content = &ref_content[start..start + end + "</mixed-citation>".len()];
284            let cleaned_citation = strip_inline_html_tags(citation_content);
285            match from_str::<Citation>(&cleaned_citation) {
286                Ok(citation) => {
287                    // Extract names from person-groups first
288                    for person_group in citation.person_groups {
289                        for name in person_group.names {
290                            authors.push(Author::new(name.surname, name.given_names));
291                        }
292                    }
293                    // Also check for direct names (without person-group wrapper)
294                    for name in citation.names {
295                        authors.push(Author::new(name.surname, name.given_names));
296                    }
297                    if !authors.is_empty() {
298                        return Ok(authors);
299                    }
300                }
301                Err(e) => {
302                    return Err(ParseError::XmlError(format!(
303                        "Failed to parse mixed-citation XML: {}",
304                        e
305                    )));
306                }
307            }
308        } else {
309            return Err(ParseError::XmlError(
310                "Found mixed-citation start tag but no matching end tag".to_string(),
311            ));
312        }
313    }
314
315    // No citations found or no authors in citations - return empty vector as success
316    Ok(authors)
317}
318
#[cfg(test)]
mod tests {
    use super::*;

    /// A contributor with name, e-mail, role and `corresp="yes"` is fully populated.
    #[test]
    fn test_extract_authors_detailed() {
        let xml = r#"
        <contrib-group>
            <contrib corresp="yes">
                <name>
                    <surname>Doe</surname>
                    <given-names>John</given-names>
                </name>
                <email>john.doe@example.com</email>
                <role>Principal Investigator</role>
            </contrib>
        </contrib-group>
        "#;

        let parsed = extract_authors(xml).unwrap();
        assert_eq!(parsed.len(), 1);

        let author = &parsed[0];
        assert_eq!(author.surname.as_deref(), Some("Doe"));
        assert_eq!(author.given_names.as_deref(), Some("John"));
        assert!(author.is_corresponding);
        assert_eq!(author.email.as_deref(), Some("john.doe@example.com"));
        assert_eq!(author.roles, vec!["Principal Investigator"]);
    }

    /// Names placed directly under an `<element-citation>` are returned in order.
    #[test]
    fn test_extract_reference_authors() {
        let xml = r#"
        <element-citation>
            <name>
                <surname>Johnson</surname>
                <given-names>Alice</given-names>
            </name>
            <name>
                <surname>Williams</surname>
                <given-names>Bob</given-names>
            </name>
        </element-citation>
        "#;

        let parsed = extract_reference_authors(xml).unwrap();

        let expected = [("Johnson", "Alice"), ("Williams", "Bob")];
        assert_eq!(parsed.len(), expected.len());
        for (author, (surname, given_names)) in parsed.iter().zip(expected) {
            assert_eq!(author.surname.as_deref(), Some(surname));
            assert_eq!(author.given_names.as_deref(), Some(given_names));
        }
    }

    /// An ORCID `<contrib-id>` is picked up alongside name and `corresp` flag.
    #[test]
    fn test_extract_orcid_from_contrib_id() {
        let xml = r#"
        <contrib-group>
            <contrib corresp="yes">
                <contrib-id contrib-id-type="orcid">https://orcid.org/0000-0002-3066-2940</contrib-id>
                <name name-style="western">
                    <surname>Doe</surname>
                    <given-names>John</given-names>
                </name>
                <email>john.doe@example.com</email>
            </contrib>
        </contrib-group>
        "#;

        let parsed = extract_authors(xml).unwrap();
        assert_eq!(parsed.len(), 1);

        let author = &parsed[0];
        assert_eq!(author.surname.as_deref(), Some("Doe"));
        assert_eq!(author.given_names.as_deref(), Some("John"));
        assert_eq!(
            author.orcid.as_deref(),
            Some("https://orcid.org/0000-0002-3066-2940")
        );
        assert!(author.is_corresponding);
    }

    /// The ORCID is still found when `<name>` follows `<contrib-id>` on the same line.
    #[test]
    fn test_extract_orcid_with_xml_tags() {
        let xml = r#"
        <contrib-group>
            <contrib>
                <contrib-id contrib-id-type="orcid">https://orcid.org/0000-0001-2345-6789</contrib-id><name name-style="western">
                    <surname>Smith</surname>
                    <given-names>Jane</given-names>
                </name>
            </contrib>
        </contrib-group>
        "#;

        let parsed = extract_authors(xml).unwrap();
        assert_eq!(parsed.len(), 1);

        let author = &parsed[0];
        assert_eq!(author.surname.as_deref(), Some("Smith"));
        assert_eq!(author.given_names.as_deref(), Some("Jane"));
        assert_eq!(
            author.orcid.as_deref(),
            Some("https://orcid.org/0000-0001-2345-6789")
        );
        assert!(!author.is_corresponding);
    }

    /// ORCID and corresponding-author flag are tracked per contributor.
    #[test]
    fn test_extract_multiple_authors_with_orcid() {
        let xml = r#"
        <contrib-group>
            <contrib>
                <contrib-id contrib-id-type="orcid">https://orcid.org/0000-0001-1111-1111</contrib-id>
                <name>
                    <surname>First</surname>
                    <given-names>Author</given-names>
                </name>
            </contrib>
            <contrib corresp="yes">
                <contrib-id contrib-id-type="orcid">https://orcid.org/0000-0002-2222-2222</contrib-id>
                <name>
                    <surname>Second</surname>
                    <given-names>Author</given-names>
                </name>
            </contrib>
            <contrib>
                <name>
                    <surname>Third</surname>
                    <given-names>Author</given-names>
                </name>
            </contrib>
        </contrib-group>
        "#;

        let parsed = extract_authors(xml).unwrap();

        // (surname, expected ORCID, expected corresponding flag)
        let expected = [
            (
                "First",
                Some("https://orcid.org/0000-0001-1111-1111"),
                false,
            ),
            (
                "Second",
                Some("https://orcid.org/0000-0002-2222-2222"),
                true,
            ),
            ("Third", None, false),
        ];
        assert_eq!(parsed.len(), expected.len());

        for (author, (surname, orcid, corresponding)) in parsed.iter().zip(expected) {
            assert_eq!(author.surname.as_deref(), Some(surname));
            assert_eq!(author.orcid.as_deref(), orcid);
            assert_eq!(author.is_corresponding, corresponding);
        }
    }
}
pubmed_parser/pmc/parser/author.rs

pubmed_parser/pmc/parser/
author.rs