pubmed_parser/pmc/parser/
author.rs1use crate::common::xml_utils::strip_inline_html_tags;
2use crate::common::{Affiliation, Author};
3use crate::error::{ParseError, Result};
4use quick_xml::de::from_str;
5use serde::Deserialize;
6
7#[derive(Debug, Deserialize)]
9#[serde(rename = "contrib-group")]
10struct ContribGroup {
11 #[serde(rename = "contrib", default)]
12 contribs: Vec<Contrib>,
13}
14
15#[derive(Debug, Deserialize)]
17struct Contrib {
18 #[serde(rename = "@corresp", default)]
19 corresp: Option<String>,
20
21 #[serde(rename = "contrib-id", default)]
22 contrib_ids: Vec<ContribId>,
23
24 #[serde(rename = "name", default)]
25 name: Option<Name>,
26
27 #[serde(rename = "email", default)]
28 email: Option<String>,
29
30 #[serde(rename = "role", default)]
31 roles: Vec<String>,
32
33 #[serde(rename = "xref", default)]
34 xrefs: Vec<Xref>,
35
36 #[serde(rename = "aff", default)]
37 affs: Vec<Aff>,
38}
39
40#[derive(Debug, Deserialize)]
42struct ContribId {
43 #[serde(rename = "@contrib-id-type")]
44 contrib_id_type: Option<String>,
45
46 #[serde(rename = "$text")]
47 value: Option<String>,
48}
49
50#[derive(Debug, Deserialize)]
52struct Name {
53 #[serde(rename = "@name-style", default)]
54 #[allow(dead_code)]
55 name_style: Option<String>,
56
57 #[serde(rename = "surname", default)]
58 surname: Option<String>,
59
60 #[serde(rename = "given-names", default)]
61 given_names: Option<String>,
62
63 #[serde(rename = "suffix", default)]
64 suffix: Option<String>,
65}
66
67#[derive(Debug, Deserialize)]
69struct Xref {
70 #[serde(rename = "@ref-type")]
71 ref_type: Option<String>,
72
73 #[serde(rename = "@rid")]
74 rid: Option<String>,
75}
76
77#[derive(Debug, Deserialize)]
79struct Aff {
80 #[serde(rename = "@id")]
81 id: Option<String>,
82
83 #[serde(rename = "$text", default)]
84 text: Option<String>,
85
86 #[serde(rename = "institution", default)]
87 #[allow(dead_code)]
88 institutions: Vec<String>,
89
90 #[serde(rename = "addr-line", default)]
91 #[allow(dead_code)]
92 addr_lines: Vec<String>,
93
94 #[serde(rename = "country", default)]
95 #[allow(dead_code)]
96 countries: Vec<String>,
97}
98
99#[derive(Debug, Deserialize)]
101#[serde(rename_all = "kebab-case")]
102struct Citation {
103 #[serde(rename = "person-group", default)]
104 person_groups: Vec<PersonGroup>,
105 #[serde(rename = "name", default)]
106 names: Vec<Name>,
107}
108
109#[derive(Debug, Deserialize)]
111#[serde(rename = "person-group")]
112struct PersonGroup {
113 #[serde(rename = "@person-group-type")]
114 _group_type: Option<String>,
115 #[serde(rename = "name", default)]
116 names: Vec<Name>,
117}
118
119pub fn extract_authors(content: &str) -> Result<Vec<Author>> {
121 if let Some(contrib_start) = content.find("<contrib-group>") {
123 if let Some(contrib_end) = content[contrib_start..].find("</contrib-group>") {
124 let contrib_section =
125 &content[contrib_start..contrib_start + contrib_end + "</contrib-group>".len()];
126
127 let cleaned_section = strip_inline_html_tags(contrib_section);
129 match from_str::<ContribGroup>(&cleaned_section) {
130 Ok(contrib_group) => {
131 let authors = contrib_group
132 .contribs
133 .into_iter()
134 .filter_map(parse_contrib_to_author)
135 .collect();
136 Ok(authors)
137 }
138 Err(e) => {
139 tracing::warn!(
141 "Failed to parse contrib-group XML ({}), continuing with empty authors",
142 e
143 );
144 Ok(Vec::new())
145 }
146 }
147 } else {
148 Err(ParseError::XmlError(
149 "Found contrib-group start tag but no matching end tag".to_string(),
150 ))
151 }
152 } else {
153 Ok(Vec::new())
155 }
156}
157
158fn parse_contrib_to_author(contrib: Contrib) -> Option<Author> {
160 let name = contrib.name?;
161
162 let mut author = Author::new(name.surname.clone(), name.given_names.clone());
163 author.suffix = name.suffix;
164
165 for contrib_id in &contrib.contrib_ids {
167 if let Some(id_type) = &contrib_id.contrib_id_type
168 && id_type == "orcid"
169 && let Some(value) = &contrib_id.value
170 {
171 let clean_orcid = value.trim();
172 if clean_orcid.contains("orcid.org") || !clean_orcid.is_empty() {
173 author.orcid = Some(clean_orcid.to_string());
174 break;
175 }
176 }
177 }
178
179 author.email = contrib.email.map(|e| e.trim().to_string());
181
182 author.is_corresponding = contrib.corresp.map(|c| c == "yes").unwrap_or(false)
184 || contrib
185 .xrefs
186 .iter()
187 .any(|x| x.ref_type.as_deref() == Some("corresp"));
188
189 author.roles = contrib
191 .roles
192 .into_iter()
193 .map(|r| r.trim().to_string())
194 .filter(|r| !r.is_empty())
195 .collect();
196
197 let mut affiliations = Vec::new();
199
200 for xref in &contrib.xrefs {
202 if let Some(ref_type) = &xref.ref_type
203 && ref_type == "aff"
204 && let Some(rid) = &xref.rid
205 {
206 affiliations.push(Affiliation {
207 id: Some(rid.clone()),
208 institution: Some(rid.clone()), department: None,
210 address: None,
211 country: None,
212 });
213 }
214 }
215
216 for aff in &contrib.affs {
218 if let Some(text) = &aff.text {
219 let clean_text = text.trim();
220 if !clean_text.is_empty() {
221 affiliations.push(Affiliation {
222 id: aff.id.clone(),
223 institution: Some(clean_text.to_string()),
224 department: None,
225 address: None,
226 country: None,
227 });
228 }
229 }
230 }
231
232 author.affiliations = affiliations;
233
234 Some(author)
235}
236
237pub fn extract_reference_authors(ref_content: &str) -> Result<Vec<Author>> {
239 let mut authors = Vec::new();
240
241 if ref_content.contains("<element-citation") {
243 if let Some(start) = ref_content.find("<element-citation")
244 && let Some(end) = ref_content[start..].find("</element-citation>")
245 {
246 let citation_content = &ref_content[start..start + end + "</element-citation>".len()];
247 let cleaned_citation = strip_inline_html_tags(citation_content);
248 match from_str::<Citation>(&cleaned_citation) {
249 Ok(citation) => {
250 for person_group in citation.person_groups {
252 for name in person_group.names {
253 authors.push(Author::new(name.surname, name.given_names));
254 }
255 }
256 for name in citation.names {
258 authors.push(Author::new(name.surname, name.given_names));
259 }
260 if !authors.is_empty() {
261 return Ok(authors);
262 }
263 }
264 Err(e) => {
265 return Err(ParseError::XmlError(format!(
266 "Failed to parse element-citation XML: {}",
267 e
268 )));
269 }
270 }
271 } else {
272 return Err(ParseError::XmlError(
273 "Found element-citation start tag but no matching end tag".to_string(),
274 ));
275 }
276 }
277
278 if ref_content.contains("<mixed-citation") {
280 if let Some(start) = ref_content.find("<mixed-citation")
281 && let Some(end) = ref_content[start..].find("</mixed-citation>")
282 {
283 let citation_content = &ref_content[start..start + end + "</mixed-citation>".len()];
284 let cleaned_citation = strip_inline_html_tags(citation_content);
285 match from_str::<Citation>(&cleaned_citation) {
286 Ok(citation) => {
287 for person_group in citation.person_groups {
289 for name in person_group.names {
290 authors.push(Author::new(name.surname, name.given_names));
291 }
292 }
293 for name in citation.names {
295 authors.push(Author::new(name.surname, name.given_names));
296 }
297 if !authors.is_empty() {
298 return Ok(authors);
299 }
300 }
301 Err(e) => {
302 return Err(ParseError::XmlError(format!(
303 "Failed to parse mixed-citation XML: {}",
304 e
305 )));
306 }
307 }
308 } else {
309 return Err(ParseError::XmlError(
310 "Found mixed-citation start tag but no matching end tag".to_string(),
311 ));
312 }
313 }
314
315 Ok(authors)
317}
318
#[cfg(test)]
mod tests {
    use super::*;

    /// A contributor with name, email, role and `corresp="yes"` is fully populated.
    #[test]
    fn test_extract_authors_detailed() {
        let content = r#"
            <contrib-group>
                <contrib corresp="yes">
                    <name>
                        <surname>Doe</surname>
                        <given-names>John</given-names>
                    </name>
                    <email>john.doe@example.com</email>
                    <role>Principal Investigator</role>
                </contrib>
            </contrib-group>
        "#;

        let authors = extract_authors(content).unwrap();
        assert_eq!(authors.len(), 1);

        let author = &authors[0];
        assert_eq!(author.surname.as_deref(), Some("Doe"));
        assert_eq!(author.given_names.as_deref(), Some("John"));
        assert!(author.is_corresponding);
        assert_eq!(author.email.as_deref(), Some("john.doe@example.com"));
        assert_eq!(author.roles, vec!["Principal Investigator"]);
    }

    /// Names placed directly under `<element-citation>` are returned in order.
    #[test]
    fn test_extract_reference_authors() {
        let content = r#"
            <element-citation>
                <name>
                    <surname>Johnson</surname>
                    <given-names>Alice</given-names>
                </name>
                <name>
                    <surname>Williams</surname>
                    <given-names>Bob</given-names>
                </name>
            </element-citation>
        "#;

        let authors = extract_reference_authors(content).unwrap();
        let names: Vec<_> = authors
            .iter()
            .map(|a| (a.surname.as_deref(), a.given_names.as_deref()))
            .collect();

        assert_eq!(
            names,
            vec![
                (Some("Johnson"), Some("Alice")),
                (Some("Williams"), Some("Bob")),
            ]
        );
    }

    /// An ORCID given as a full URL is stored unchanged.
    #[test]
    fn test_extract_orcid_from_contrib_id() {
        let content = r#"
            <contrib-group>
                <contrib corresp="yes">
                    <contrib-id contrib-id-type="orcid">https://orcid.org/0000-0002-3066-2940</contrib-id>
                    <name name-style="western">
                        <surname>Doe</surname>
                        <given-names>John</given-names>
                    </name>
                    <email>john.doe@example.com</email>
                </contrib>
            </contrib-group>
        "#;

        let authors = extract_authors(content).unwrap();
        assert_eq!(authors.len(), 1);

        let author = &authors[0];
        assert_eq!(author.surname.as_deref(), Some("Doe"));
        assert_eq!(author.given_names.as_deref(), Some("John"));
        assert_eq!(
            author.orcid.as_deref(),
            Some("https://orcid.org/0000-0002-3066-2940")
        );
        assert!(author.is_corresponding);
    }

    /// The ORCID is found when `<contrib-id>` and `<name>` share a line.
    #[test]
    fn test_extract_orcid_with_xml_tags() {
        let content = r#"
            <contrib-group>
                <contrib>
                    <contrib-id contrib-id-type="orcid">https://orcid.org/0000-0001-2345-6789</contrib-id><name name-style="western">
                        <surname>Smith</surname>
                        <given-names>Jane</given-names>
                    </name>
                </contrib>
            </contrib-group>
        "#;

        let authors = extract_authors(content).unwrap();
        assert_eq!(authors.len(), 1);

        let author = &authors[0];
        assert_eq!(author.surname.as_deref(), Some("Smith"));
        assert_eq!(author.given_names.as_deref(), Some("Jane"));
        assert_eq!(
            author.orcid.as_deref(),
            Some("https://orcid.org/0000-0001-2345-6789")
        );
        assert!(!author.is_corresponding);
    }

    /// ORCID and corresponding-author flag are tracked per contributor.
    #[test]
    fn test_extract_multiple_authors_with_orcid() {
        let content = r#"
            <contrib-group>
                <contrib>
                    <contrib-id contrib-id-type="orcid">https://orcid.org/0000-0001-1111-1111</contrib-id>
                    <name>
                        <surname>First</surname>
                        <given-names>Author</given-names>
                    </name>
                </contrib>
                <contrib corresp="yes">
                    <contrib-id contrib-id-type="orcid">https://orcid.org/0000-0002-2222-2222</contrib-id>
                    <name>
                        <surname>Second</surname>
                        <given-names>Author</given-names>
                    </name>
                </contrib>
                <contrib>
                    <name>
                        <surname>Third</surname>
                        <given-names>Author</given-names>
                    </name>
                </contrib>
            </contrib-group>
        "#;

        // (surname, orcid, is_corresponding) for each expected author, in order.
        let expected = [
            ("First", Some("https://orcid.org/0000-0001-1111-1111"), false),
            ("Second", Some("https://orcid.org/0000-0002-2222-2222"), true),
            ("Third", None, false),
        ];

        let authors = extract_authors(content).unwrap();
        assert_eq!(authors.len(), expected.len());

        for (author, (surname, orcid, corresponding)) in authors.iter().zip(expected) {
            assert_eq!(author.surname.as_deref(), Some(surname));
            assert_eq!(author.orcid.as_deref(), orcid);
            assert_eq!(author.is_corresponding, corresponding);
        }
    }
}