1use crate::common::{HistoryDate, PublicationDate};
2use crate::pmc::domain::{FundingInfo, JournalMeta, SupplementaryMaterial};
3
4use super::xml_utils;
5
6pub fn extract_journal_info(content: &str) -> JournalMeta {
11 let title = xml_utils::extract_text_between(content, "<journal-title>", "</journal-title>")
12 .unwrap_or_else(|| "Unknown Journal".to_string());
13
14 let abbreviation = xml_utils::extract_text_between(
15 content,
16 "<journal-id journal-id-type=\"iso-abbrev\">",
17 "</journal-id>",
18 );
19
20 let mut issn_print = None;
22 let mut issn_electronic = None;
23 let mut pos = 0;
24 while let Some(issn_start) = content[pos..].find("<issn") {
25 let issn_start = pos + issn_start;
26 if let Some(issn_end) = content[issn_start..].find("</issn>") {
27 let issn_end = issn_start + issn_end;
28 let issn_section = &content[issn_start..issn_end];
29
30 if let Some(content_start) = issn_section.find(">") {
31 let issn_value = &issn_section[content_start + 1..];
32
33 if issn_section.contains("pub-type=\"epub\"") {
34 issn_electronic = Some(issn_value.to_string());
35 } else if issn_section.contains("pub-type=\"ppub\"") {
36 issn_print = Some(issn_value.to_string());
37 }
38 }
39 pos = issn_end;
40 } else {
41 break;
42 }
43 }
44
45 let publisher =
46 xml_utils::extract_text_between(content, "<publisher-name>", "</publisher-name>");
47
48 JournalMeta {
49 title,
50 abbreviation,
51 issn_print,
52 issn_electronic,
53 publisher,
54 }
55}
56
57pub fn extract_volume(content: &str) -> Option<String> {
59 xml_utils::extract_text_between(content, "<volume>", "</volume>")
60}
61
62pub fn extract_issue(content: &str) -> Option<String> {
64 xml_utils::extract_text_between(content, "<issue>", "</issue>")
65}
66
67pub fn extract_pub_dates(content: &str) -> Vec<PublicationDate> {
71 let mut dates = Vec::new();
72
73 let mut pos = 0;
74 while let Some(pd_start) = content[pos..].find("<pub-date") {
75 let pd_start = pos + pd_start;
76 if let Some(pd_end) = content[pd_start..].find("</pub-date>") {
77 let pd_end = pd_start + pd_end + "</pub-date>".len();
78 let pd_section = &content[pd_start..pd_end];
79
80 let pub_type = xml_utils::extract_attribute_value(pd_section, "pub-type")
81 .or_else(|| xml_utils::extract_attribute_value(pd_section, "date-type"));
82
83 let year = xml_utils::extract_text_between(pd_section, "<year>", "</year>")
84 .and_then(|y| y.parse::<u16>().ok());
85 let month = xml_utils::extract_text_between(pd_section, "<month>", "</month>")
86 .and_then(|m| m.parse::<u8>().ok());
87 let day = xml_utils::extract_text_between(pd_section, "<day>", "</day>")
88 .and_then(|d| d.parse::<u8>().ok());
89
90 dates.push(PublicationDate {
91 pub_type,
92 year,
93 month,
94 day,
95 });
96
97 pos = pd_end;
98 } else {
99 break;
100 }
101 }
102
103 dates
104}
105
106pub fn extract_pub_date(content: &str) -> String {
108 if let Some(year) = xml_utils::extract_text_between_ref(content, "<year>", "</year>") {
109 if let Some(month) = xml_utils::extract_text_between_ref(content, "<month>", "</month>") {
110 if let Some(day) = xml_utils::extract_text_between_ref(content, "<day>", "</day>") {
111 return format!(
112 "{}-{:02}-{:02}",
113 year,
114 month.parse::<u32>().unwrap_or(1),
115 day.parse::<u32>().unwrap_or(1)
116 );
117 }
118 return format!("{}-{:02}", year, month.parse::<u32>().unwrap_or(1));
119 }
120 return year.to_string();
121 }
122 "Unknown Date".to_string()
123}
124
/// Returns the trimmed text of the first `<article-id>` element in `content`
/// whose opening tag carries `pub-id-type="<id_type>"`.
///
/// The attribute may appear anywhere inside the opening tag and may use single
/// or double quotes, so `<article-id specific-use="x" pub-id-type="doi">` is
/// matched as well as the bare form. Self-closing tags are skipped because
/// they have no text. Returns `None` when no such element exists or when the
/// matching element is never closed.
fn extract_article_id_of_type(content: &str, id_type: &str) -> Option<String> {
    const OPEN: &str = "<article-id";
    const CLOSE: &str = "</article-id>";

    let double_quoted = format!("pub-id-type=\"{id_type}\"");
    let single_quoted = format!("pub-id-type='{id_type}'");

    let mut rest = content;
    while let Some(open_at) = rest.find(OPEN) {
        let after_name = &rest[open_at + OPEN.len()..];
        // No '>' after this point means no later tag can be complete either.
        let tag_end = after_name.find('>')?;
        let attributes = &after_name[..tag_end];
        let after_tag = &after_name[tag_end + 1..];

        // Guard against a longer element name that merely starts with "article-id".
        let is_article_id = attributes.is_empty()
            || attributes.starts_with(|c: char| c == '/' || c.is_whitespace());
        let has_wanted_type = attributes.contains(double_quoted.as_str())
            || attributes.contains(single_quoted.as_str());
        let is_self_closing = attributes.ends_with('/');

        if is_article_id && has_wanted_type && !is_self_closing {
            // A missing closing tag here also rules out every later candidate.
            let value_len = after_tag.find(CLOSE)?;
            return Some(after_tag[..value_len].trim().to_string());
        }

        rest = after_tag;
    }

    None
}

/// Returns the article DOI: the trimmed text of the first
/// `<article-id pub-id-type="doi">` element, or `None` when there is none.
pub fn extract_doi(content: &str) -> Option<String> {
    extract_article_id_of_type(content, "doi")
}

/// Returns the PubMed ID: the trimmed text of the first
/// `<article-id pub-id-type="pmid">` element, or `None` when there is none.
pub fn extract_pmid(content: &str) -> Option<String> {
    extract_article_id_of_type(content, "pmid")
}
158
159pub fn extract_article_type(content: &str) -> Option<String> {
161 if let Some(article_start) = content.find("<article")
163 && let Some(article_end) = content[article_start..].find(">")
164 {
165 let article_tag = &content[article_start..article_start + article_end];
166 if let Some(type_start) = article_tag.find("article-type=\"") {
167 let type_start = type_start + 14; if let Some(type_end) = article_tag[type_start..].find('"') {
169 return Some(article_tag[type_start..type_start + type_end].to_string());
170 }
171 }
172 }
173
174 xml_utils::extract_text_between(content, "<subject>", "</subject>")
176}
177
178pub fn extract_keywords(content: &str) -> Vec<String> {
180 let mut keywords = Vec::new();
181
182 if let Some(kwd_start) = content.find("<kwd-group")
183 && let Some(kwd_end) = content[kwd_start..].find("</kwd-group>")
184 {
185 let kwd_section = &content[kwd_start..kwd_start + kwd_end];
186
187 let mut pos = 0;
188 while let Some(kwd_start) = kwd_section[pos..].find("<kwd>") {
189 let kwd_start = pos + kwd_start + 5; if let Some(kwd_end) = kwd_section[kwd_start..].find("</kwd>") {
191 let raw_keyword = kwd_section[kwd_start..kwd_start + kwd_end].trim();
192 if raw_keyword.contains('<') {
194 let keyword = xml_utils::strip_xml_tags(raw_keyword);
195 if !keyword.is_empty() {
196 keywords.push(keyword);
197 }
198 } else if !raw_keyword.is_empty() {
199 keywords.push(raw_keyword.to_string());
200 }
201 pos = kwd_start + kwd_end;
202 } else {
203 break;
204 }
205 }
206 }
207
208 keywords
209}
210
211pub fn extract_funding(content: &str) -> Vec<FundingInfo> {
213 let mut funding = Vec::new();
214
215 if let Some(funding_start) = content.find("<funding-group>")
216 && let Some(funding_end) = content[funding_start..].find("</funding-group>")
217 {
218 let funding_section =
219 &content[funding_start..funding_start + funding_end + "</funding-group>".len()];
220
221 let statement = xml_utils::extract_text_between(
223 funding_section,
224 "<funding-statement>",
225 "</funding-statement>",
226 );
227
228 let mut pos = 0;
229 while let Some(award_start) = funding_section[pos..].find("<award-group") {
230 let award_start = pos + award_start;
231 if let Some(award_end) = funding_section[award_start..].find("</award-group>") {
232 let award_end = award_start + award_end;
233 let award_section = &funding_section[award_start..award_end];
234
235 let source = xml_utils::extract_text_between(
236 award_section,
237 "<funding-source>",
238 "</funding-source>",
239 )
240 .unwrap_or_else(|| "Unknown Source".to_string());
241
242 let award_id =
243 xml_utils::extract_text_between(award_section, "<award-id>", "</award-id>");
244
245 funding.push(FundingInfo {
246 source,
247 award_id,
248 statement: statement.clone(),
249 });
250 pos = award_end;
251 } else {
252 break;
253 }
254 }
255 }
256
257 funding
258}
259
260pub fn extract_conflict_of_interest(content: &str) -> Option<String> {
262 if let Some(fn_start) = content.find("<fn-group")
264 && let Some(fn_end) = content[fn_start..].find("</fn-group>")
265 {
266 let fn_section = &content[fn_start..fn_start + fn_end];
267
268 let mut pos = 0;
270 while let Some(fn_start) = fn_section[pos..].find("<fn") {
271 let fn_start = pos + fn_start;
272 if let Some(fn_end) = fn_section[fn_start..].find("</fn>") {
273 let fn_end = fn_start + fn_end;
274 let fn_content = &fn_section[fn_start..fn_end];
275
276 if (fn_content.contains("conflict") || fn_content.contains("competing"))
277 && let Some(p_start) = fn_content.find("<p>")
278 && let Some(p_end) = fn_content[p_start..].find("</p>")
279 {
280 let coi = &fn_content[p_start + 3..p_start + p_end];
281 return Some(xml_utils::strip_xml_tags(coi));
282 }
283 pos = fn_end;
284 } else {
285 break;
286 }
287 }
288 }
289
290 if let Some(coi_start) = content.find("<sec")
292 && let Some(coi_end) = content[coi_start..].find("</sec>")
293 {
294 let coi_section = &content[coi_start..coi_start + coi_end];
295 if (coi_section.contains("conflict") || coi_section.contains("competing"))
296 && let Some(title_start) = coi_section.find("<title>")
297 && let Some(title_end) = coi_section[title_start..].find("</title>")
298 {
299 let title = &coi_section[title_start + 7..title_start + title_end];
300 if (title.to_lowercase().contains("conflict")
301 || title.to_lowercase().contains("competing"))
302 && let Some(p_start) = coi_section.find("<p>")
303 && let Some(p_end) = coi_section[p_start..].find("</p>")
304 {
305 let coi = &coi_section[p_start + 3..p_start + p_end];
306 return Some(xml_utils::strip_xml_tags(coi));
307 }
308 }
309 }
310
311 None
312}
313
314pub fn extract_acknowledgments(content: &str) -> Option<String> {
318 xml_utils::extract_text_between(content, "<ack>", "</ack>")
319 .map(|ack| xml_utils::strip_xml_tags(&ack))
320 .map(|s| xml_utils::decode_xml_entities(&s).into_owned())
321}
322
323pub fn extract_data_availability(content: &str) -> Option<String> {
325 if let Some(data_start) = content.find("<sec")
327 && let Some(data_end) = content[data_start..].find("</sec>")
328 {
329 let data_section = &content[data_start..data_start + data_end];
330 if data_section.contains("data") && data_section.contains("availab") {
331 return Some(xml_utils::strip_xml_tags(data_section));
332 }
333 }
334
335 if let Some(supp_start) = content.find("<supplementary-material")
337 && let Some(supp_end) = content[supp_start..].find("</supplementary-material>")
338 {
339 let supp_section = &content[supp_start..supp_start + supp_end];
340 if supp_section.contains("data") && supp_section.contains("availab") {
341 return Some(xml_utils::strip_xml_tags(supp_section));
342 }
343 }
344
345 None
346}
347
348pub fn extract_supplementary_materials(content: &str) -> Vec<SupplementaryMaterial> {
350 let mut materials = Vec::new();
351
352 let mut pos = 0;
353 while let Some(supp_start) = content[pos..].find("<supplementary-material") {
354 let supp_start = pos + supp_start;
355 if let Some(supp_end) = content[supp_start..].find("</supplementary-material>") {
356 let supp_end = supp_start + supp_end;
357 let supp_content = &content[supp_start..supp_end];
358
359 let id = xml_utils::extract_attribute_value(supp_content, "id").unwrap_or_else(|| {
360 let supp_num = materials.len() + 1;
361 format!("supp_{supp_num}")
362 });
363
364 let label = xml_utils::extract_text_between(supp_content, "<label>", "</label>");
365 let caption = xml_utils::extract_text_between(supp_content, "<caption>", "</caption>")
366 .and_then(|caption_content| {
367 xml_utils::extract_text_between(&caption_content, "<title>", "</title>")
369 .or_else(|| {
370 Some(xml_utils::strip_xml_tags(&caption_content))
372 })
373 })
374 .unwrap_or_else(|| "No caption available".to_string());
375
376 let content_type = xml_utils::extract_attribute_value(supp_content, "content-type");
377 let href = xml_utils::extract_attribute_value(supp_content, "href")
378 .or_else(|| xml_utils::extract_attribute_value(supp_content, "xlink:href"))
379 .or_else(|| {
380 if let Some(media_start) = supp_content.find("<media") {
382 if let Some(media_end) = supp_content[media_start..].find(">") {
383 let media_tag = &supp_content[media_start..media_start + media_end + 1];
384 xml_utils::extract_attribute_value(media_tag, "xlink:href")
385 .or_else(|| xml_utils::extract_attribute_value(media_tag, "href"))
386 } else {
387 None
388 }
389 } else {
390 None
391 }
392 });
393
394 materials.push(SupplementaryMaterial {
395 id,
396 content_type,
397 title: Some(caption),
398 description: label,
399 href,
400 });
401 pos = supp_end;
402 } else {
403 break;
404 }
405 }
406
407 materials
408}
409
410pub fn extract_title(content: &str) -> String {
412 xml_utils::extract_text_between_ref(content, "<article-title>", "</article-title>")
413 .map(|s| s.to_string())
414 .unwrap_or_else(|| "Unknown Title".to_string())
415}
416
417pub fn extract_subtitle(content: &str) -> Option<String> {
419 let title_group = xml_utils::extract_element_content(content, "title-group")?;
420 xml_utils::extract_text_between(&title_group, "<subtitle>", "</subtitle>")
421 .map(|s| xml_utils::strip_xml_tags(&s))
422 .filter(|s| !s.is_empty())
423}
424
425pub fn extract_language(content: &str) -> Option<String> {
427 if let Some(article_start) = content.find("<article")
429 && let Some(article_end) = content[article_start..].find(">")
430 {
431 let article_tag = &content[article_start..article_start + article_end];
432 if let Some(lang) = xml_utils::extract_attribute_value(article_tag, "xml:lang") {
433 return Some(lang);
434 }
435 }
436 None
437}
438
439pub fn extract_article_ids(content: &str) -> Vec<(String, String)> {
441 let mut ids = Vec::new();
442
443 let id_tags = xml_utils::find_all_tags(content, "article-id");
444 for id_tag in id_tags {
445 if let Some(id_type) = xml_utils::extract_attribute_value(&id_tag, "pub-id-type")
446 && let Some(id_value) = xml_utils::extract_element_content(&id_tag, "article-id")
447 {
448 ids.push((id_type, id_value.trim().to_string()));
449 }
450 }
451
452 ids
453}
454
455pub fn extract_copyright(content: &str) -> Option<String> {
459 xml_utils::extract_text_between(content, "<copyright-statement>", "</copyright-statement>")
460 .or_else(|| {
461 xml_utils::extract_text_between(content, "<copyright-year>", "</copyright-year>")
462 })
463 .map(|s| xml_utils::decode_xml_entities(&s).into_owned())
464}
465
466pub fn extract_license(content: &str) -> Option<String> {
468 xml_utils::extract_element_content(content, "license")
469 .map(|license_content| xml_utils::strip_xml_tags(&license_content))
470}
471
472pub fn extract_abstract(content: &str) -> Option<String> {
477 let abstract_content = xml_utils::extract_element_content(content, "abstract")?;
478
479 let paragraphs = xml_utils::extract_all_text_between(&abstract_content, "<p", "</p>");
481 if paragraphs.is_empty() {
482 let text = xml_utils::strip_xml_tags(&abstract_content);
484 if text.is_empty() {
485 return None;
486 }
487 return Some(text);
488 }
489
490 let text = paragraphs
491 .iter()
492 .map(|p| {
493 let content = if let Some(gt_pos) = p.find('>') {
496 &p[gt_pos + 1..]
497 } else {
498 p
499 };
500 xml_utils::strip_xml_tags(content)
501 })
502 .filter(|s| !s.is_empty())
503 .collect::<Vec<_>>()
504 .join(" ");
505
506 if text.is_empty() { None } else { Some(text) }
507}
508
509pub fn extract_history_dates(content: &str) -> Vec<HistoryDate> {
513 let mut dates = Vec::new();
514
515 let history_content = match xml_utils::extract_element_content(content, "history") {
516 Some(c) => c,
517 None => return dates,
518 };
519
520 let date_tags = xml_utils::find_all_tags(&history_content, "date");
521 for date_tag in &date_tags {
522 let date_type = match xml_utils::extract_attribute_value(date_tag, "date-type") {
523 Some(dt) => dt,
524 None => continue,
525 };
526
527 let year = xml_utils::extract_text_between(date_tag, "<year>", "</year>")
528 .and_then(|y| y.parse::<u16>().ok());
529 let month = xml_utils::extract_text_between(date_tag, "<month>", "</month>")
530 .and_then(|m| m.parse::<u8>().ok());
531 let day = xml_utils::extract_text_between(date_tag, "<day>", "</day>")
532 .and_then(|d| d.parse::<u8>().ok());
533
534 dates.push(HistoryDate {
535 date_type,
536 year,
537 month,
538 day,
539 });
540 }
541
542 dates
543}
544
545pub fn extract_categories(content: &str) -> Vec<String> {
547 let mut categories = Vec::new();
548
549 let categories_content = match xml_utils::extract_element_content(content, "article-categories")
550 {
551 Some(c) => c,
552 None => return categories,
553 };
554
555 let subjects =
556 xml_utils::extract_all_text_between(&categories_content, "<subject>", "</subject>");
557 for subject in subjects {
558 let cleaned = xml_utils::strip_xml_tags(&subject);
559 if !cleaned.is_empty() {
560 categories.push(cleaned);
561 }
562 }
563
564 categories
565}
566
567pub fn extract_license_url(content: &str) -> Option<String> {
570 if let Some(license_start) = content.find("<license")
572 && let Some(tag_end) = content[license_start..].find('>')
573 {
574 let license_tag = &content[license_start..license_start + tag_end + 1];
575 let url = xml_utils::extract_attribute_value(license_tag, "xlink:href")
576 .or_else(|| xml_utils::extract_attribute_value(license_tag, "href"));
577 if url.is_some() {
578 return url;
579 }
580 }
581
582 xml_utils::extract_element_content(content, "ali:license_ref")
584 .map(|s| s.trim().to_string())
585 .filter(|s| !s.is_empty())
586}
587
588pub fn extract_fpage(content: &str) -> Option<String> {
592 xml_utils::extract_element_content(content, "fpage")
593 .map(|s| s.trim().to_string())
594 .filter(|s| !s.is_empty())
595}
596
597pub fn extract_lpage(content: &str) -> Option<String> {
601 xml_utils::extract_element_content(content, "lpage")
602 .map(|s| s.trim().to_string())
603 .filter(|s| !s.is_empty())
604}
605
606pub fn extract_elocation_id(content: &str) -> Option<String> {
608 xml_utils::extract_text_between(content, "<elocation-id>", "</elocation-id>")
609}
610
#[cfg(test)]
mod tests {
    use super::*;

    /// Shorthand for the `Some(String)` values most extractors return.
    fn some(text: &str) -> Option<String> {
        Some(text.to_string())
    }

    #[test]
    fn test_extract_title() {
        assert_eq!(
            extract_title(r#"<article-title>Test Article Title</article-title>"#),
            "Test Article Title"
        );
    }

    #[test]
    fn test_extract_doi() {
        let xml = r#"<article-id pub-id-type="doi">10.1234/test.doi</article-id>"#;
        assert_eq!(extract_doi(xml), some("10.1234/test.doi"));
    }

    #[test]
    fn test_extract_pmid() {
        let xml = r#"<article-id pub-id-type="pmid">12345678</article-id>"#;
        assert_eq!(extract_pmid(xml), some("12345678"));
    }

    #[test]
    fn test_extract_keywords() {
        let xml = r#"
            <kwd-group>
                <kwd>keyword1</kwd>
                <kwd>keyword2</kwd>
                <kwd>keyword3</kwd>
            </kwd-group>
        "#;

        assert_eq!(
            extract_keywords(xml),
            vec!["keyword1", "keyword2", "keyword3"]
        );
    }

    #[test]
    fn test_extract_keywords_with_nested_tags() {
        let xml = r#"
            <kwd-group>
                <kwd><italic toggle="yes">Prevotella copri</italic></kwd>
                <kwd>normal keyword</kwd>
                <kwd><bold>important</bold> keyword</kwd>
            </kwd-group>
        "#;

        let expected = vec!["Prevotella copri", "normal keyword", "important keyword"];
        assert_eq!(extract_keywords(xml), expected);
    }

    #[test]
    fn test_extract_pub_date() {
        // (input, expected) pairs from most to least complete.
        let cases = [
            (
                r#"<year>2023</year><month>12</month><day>25</day>"#,
                "2023-12-25",
            ),
            (r#"<year>2023</year><month>12</month>"#, "2023-12"),
            (r#"<year>2023</year>"#, "2023"),
            (r#"<title>No date here</title>"#, "Unknown Date"),
        ];

        for (xml, expected) in cases {
            assert_eq!(extract_pub_date(xml), expected);
        }
    }

    #[test]
    fn test_extract_article_type() {
        let xml = r#"<article article-type="research-article">Content</article>"#;
        assert_eq!(extract_article_type(xml), some("research-article"));
    }

    #[test]
    fn test_extract_language() {
        let xml = r#"<article xml:lang="en">Content</article>"#;
        assert_eq!(extract_language(xml), some("en"));
    }

    #[test]
    fn test_extract_article_ids() {
        let xml = r#"
            <article-id pub-id-type="doi">10.1234/test</article-id>
            <article-id pub-id-type="pmid">12345</article-id>
            <article-id pub-id-type="pmc">PMC123456</article-id>
        "#;

        let ids = extract_article_ids(xml);
        assert_eq!(ids.len(), 3);
        for (kind, value) in [
            ("doi", "10.1234/test"),
            ("pmid", "12345"),
            ("pmc", "PMC123456"),
        ] {
            assert!(ids.contains(&(kind.to_string(), value.to_string())));
        }
    }

    #[test]
    fn test_extract_acknowledgments() {
        let xml = r#"<ack><p>We thank the contributors for their valuable input.</p></ack>"#;
        assert_eq!(
            extract_acknowledgments(xml),
            some("We thank the contributors for their valuable input.")
        );
    }

    #[test]
    fn test_extract_abstract_simple() {
        let xml = r#"<abstract><p>This is a simple abstract.</p></abstract>"#;
        assert_eq!(extract_abstract(xml), some("This is a simple abstract."));
    }

    #[test]
    fn test_extract_abstract_structured() {
        let xml = r#"
            <abstract id="Abs1">
                <sec>
                    <title>Background</title>
                    <p>Background text.</p>
                </sec>
                <sec>
                    <title>Methods</title>
                    <p>Methods text.</p>
                </sec>
            </abstract>
        "#;

        let extracted = extract_abstract(xml);
        assert!(extracted.is_some());
        let text = extracted.unwrap();
        assert!(text.contains("Background text."));
        assert!(text.contains("Methods text."));
    }

    #[test]
    fn test_extract_abstract_with_attributes() {
        let xml = r#"<abstract><p id="Par1">Text with id attribute.</p></abstract>"#;
        assert_eq!(extract_abstract(xml), some("Text with id attribute."));
    }

    #[test]
    fn test_extract_abstract_missing() {
        assert!(extract_abstract(r#"<title>No abstract here</title>"#).is_none());
    }

    #[test]
    fn test_extract_history_dates() {
        let xml = r#"
            <history>
                <date date-type="received">
                    <day>21</day>
                    <month>2</month>
                    <year>2019</year>
                </date>
                <date date-type="accepted">
                    <day>23</day>
                    <month>4</month>
                    <year>2019</year>
                </date>
            </history>
        "#;

        let dates = extract_history_dates(xml);
        assert_eq!(dates.len(), 2);

        let received = &dates[0];
        assert_eq!(received.date_type, "received");
        assert_eq!(received.year, Some(2019));
        assert_eq!(received.month, Some(2));
        assert_eq!(received.day, Some(21));

        let accepted = &dates[1];
        assert_eq!(accepted.date_type, "accepted");
        assert_eq!(accepted.year, Some(2019));
        assert_eq!(accepted.month, Some(4));
        assert_eq!(accepted.day, Some(23));
    }

    #[test]
    fn test_extract_history_dates_compact() {
        let xml = r#"
            <history>
<date date-type="received"><day>09</day><month>5</month><year>2023</year></date>
<date date-type="accepted"><day>29</day><month>6</month><year>2023</year></date>
</history>
        "#;

        let dates = extract_history_dates(xml);
        assert_eq!(dates.len(), 2);

        let received = &dates[0];
        assert_eq!(received.date_type, "received");
        assert_eq!(received.year, Some(2023));
        assert_eq!(received.month, Some(5));
        assert_eq!(received.day, Some(9));
    }

    #[test]
    fn test_extract_history_dates_missing() {
        let xml = r#"<article-meta><title>No history</title></article-meta>"#;
        assert!(extract_history_dates(xml).is_empty());
    }

    #[test]
    fn test_extract_categories() {
        let xml = r#"
            <article-categories>
                <subj-group subj-group-type="heading">
                    <subject>Original Article</subject>
                </subj-group>
            </article-categories>
        "#;

        assert_eq!(extract_categories(xml), vec!["Original Article"]);
    }

    #[test]
    fn test_extract_categories_multiple() {
        let xml = r#"
            <article-categories>
                <subj-group subj-group-type="heading">
                    <subject>Research Article</subject>
                </subj-group>
                <subj-group subj-group-type="discipline">
                    <subject>Biology</subject>
                    <subject>Medicine</subject>
                </subj-group>
            </article-categories>
        "#;

        let categories = extract_categories(xml);
        assert_eq!(categories.len(), 3);
        for expected in ["Research Article", "Biology", "Medicine"] {
            assert!(categories.contains(&expected.to_string()));
        }
    }

    #[test]
    fn test_extract_categories_missing() {
        assert!(extract_categories(r#"<title>No categories</title>"#).is_empty());
    }

    #[test]
    fn test_extract_license_url() {
        let xml = r#"<license license-type="open-access" xlink:href="http://creativecommons.org/licenses/by-nc-nd/3.0/"><license-p>Text</license-p></license>"#;
        assert_eq!(
            extract_license_url(xml),
            some("http://creativecommons.org/licenses/by-nc-nd/3.0/")
        );
    }

    #[test]
    fn test_extract_license_url_missing() {
        let xml = r#"<license><license-p>No URL</license-p></license>"#;
        assert!(extract_license_url(xml).is_none());
    }

    #[test]
    fn test_extract_fpage_lpage() {
        let xml = r#"<fpage>1865</fpage><lpage>1868</lpage>"#;
        assert_eq!(extract_fpage(xml), some("1865"));
        assert_eq!(extract_lpage(xml), some("1868"));
    }

    #[test]
    fn test_extract_elocation_id() {
        let xml = r#"<elocation-id>e12345</elocation-id>"#;
        assert_eq!(extract_elocation_id(xml), some("e12345"));
    }

    #[test]
    fn test_extract_elocation_id_missing() {
        assert!(extract_elocation_id(r#"<fpage>100</fpage>"#).is_none());
    }
}