pubmed_parser/pmc/parser/
metadata.rs

1use crate::common::{HistoryDate, PublicationDate};
2use crate::pmc::domain::{FundingInfo, JournalMeta, SupplementaryMaterial};
3
4use super::xml_utils;
5
6/// Extract journal metadata.
7///
8/// Returns a [`JournalMeta`] without volume/issue — those belong to the article level
9/// per the JATS DTD and are extracted separately via [`extract_volume`] / [`extract_issue`].
10pub fn extract_journal_info(content: &str) -> JournalMeta {
11    let title = xml_utils::extract_text_between(content, "<journal-title>", "</journal-title>")
12        .unwrap_or_else(|| "Unknown Journal".to_string());
13
14    let abbreviation = xml_utils::extract_text_between(
15        content,
16        "<journal-id journal-id-type=\"iso-abbrev\">",
17        "</journal-id>",
18    );
19
20    // Extract ISSNs
21    let mut issn_print = None;
22    let mut issn_electronic = None;
23    let mut pos = 0;
24    while let Some(issn_start) = content[pos..].find("<issn") {
25        let issn_start = pos + issn_start;
26        if let Some(issn_end) = content[issn_start..].find("</issn>") {
27            let issn_end = issn_start + issn_end;
28            let issn_section = &content[issn_start..issn_end];
29
30            if let Some(content_start) = issn_section.find(">") {
31                let issn_value = &issn_section[content_start + 1..];
32
33                if issn_section.contains("pub-type=\"epub\"") {
34                    issn_electronic = Some(issn_value.to_string());
35                } else if issn_section.contains("pub-type=\"ppub\"") {
36                    issn_print = Some(issn_value.to_string());
37                }
38            }
39            pos = issn_end;
40        } else {
41            break;
42        }
43    }
44
45    let publisher =
46        xml_utils::extract_text_between(content, "<publisher-name>", "</publisher-name>");
47
48    JournalMeta {
49        title,
50        abbreviation,
51        issn_print,
52        issn_electronic,
53        publisher,
54    }
55}
56
57/// Extract volume number from `<volume>` element.
58pub fn extract_volume(content: &str) -> Option<String> {
59    xml_utils::extract_text_between(content, "<volume>", "</volume>")
60}
61
62/// Extract issue number from `<issue>` element.
63pub fn extract_issue(content: &str) -> Option<String> {
64    xml_utils::extract_text_between(content, "<issue>", "</issue>")
65}
66
67/// Extract structured publication dates from `<pub-date>` elements.
68///
69/// Returns a `Vec<PublicationDate>` with `pub_type` attribute preserved.
70pub fn extract_pub_dates(content: &str) -> Vec<PublicationDate> {
71    let mut dates = Vec::new();
72
73    let mut pos = 0;
74    while let Some(pd_start) = content[pos..].find("<pub-date") {
75        let pd_start = pos + pd_start;
76        if let Some(pd_end) = content[pd_start..].find("</pub-date>") {
77            let pd_end = pd_start + pd_end + "</pub-date>".len();
78            let pd_section = &content[pd_start..pd_end];
79
80            let pub_type = xml_utils::extract_attribute_value(pd_section, "pub-type")
81                .or_else(|| xml_utils::extract_attribute_value(pd_section, "date-type"));
82
83            let year = xml_utils::extract_text_between(pd_section, "<year>", "</year>")
84                .and_then(|y| y.parse::<u16>().ok());
85            let month = xml_utils::extract_text_between(pd_section, "<month>", "</month>")
86                .and_then(|m| m.parse::<u8>().ok());
87            let day = xml_utils::extract_text_between(pd_section, "<day>", "</day>")
88                .and_then(|d| d.parse::<u8>().ok());
89
90            dates.push(PublicationDate {
91                pub_type,
92                year,
93                month,
94                day,
95            });
96
97            pos = pd_end;
98        } else {
99            break;
100        }
101    }
102
103    dates
104}
105
106/// Extract publication date in YYYY-MM-DD format
107pub fn extract_pub_date(content: &str) -> String {
108    if let Some(year) = xml_utils::extract_text_between_ref(content, "<year>", "</year>") {
109        if let Some(month) = xml_utils::extract_text_between_ref(content, "<month>", "</month>") {
110            if let Some(day) = xml_utils::extract_text_between_ref(content, "<day>", "</day>") {
111                return format!(
112                    "{}-{:02}-{:02}",
113                    year,
114                    month.parse::<u32>().unwrap_or(1),
115                    day.parse::<u32>().unwrap_or(1)
116                );
117            }
118            return format!("{}-{:02}", year, month.parse::<u32>().unwrap_or(1));
119        }
120        return year.to_string();
121    }
122    "Unknown Date".to_string()
123}
124
/// Extract DOI from article metadata
///
/// Returns the trimmed text of the first `<article-id pub-id-type="doi" ...>` element,
/// or `None` when the element is missing, unterminated, or empty.
pub fn extract_doi(content: &str) -> Option<String> {
    extract_article_id_of_type(content, "doi")
}

/// Extract PMID from article metadata
///
/// Returns the trimmed text of the first `<article-id pub-id-type="pmid" ...>` element,
/// or `None` when the element is missing, unterminated, or empty.
pub fn extract_pmid(content: &str) -> Option<String> {
    extract_article_id_of_type(content, "pmid")
}

/// Return the trimmed text of the first `<article-id>` whose opening tag starts with
/// `pub-id-type="<id_type>"`.
///
/// Only the first occurrence is examined: if it lacks a `>` or a closing
/// `</article-id>`, no later occurrence can have one either, so retrying is pointless.
fn extract_article_id_of_type(content: &str, id_type: &str) -> Option<String> {
    let opening = format!(r#"<article-id pub-id-type="{id_type}""#);

    let tag_start = content.find(&opening)?;
    let from_tag = &content[tag_start..];

    // Skip any further attributes up to the end of the opening tag.
    let value_start = from_tag.find('>')? + 1;
    let after_tag = &from_tag[value_start..];

    let value_end = after_tag.find("</article-id>")?;
    let id = after_tag[..value_end].trim();

    // An empty element carries no identifier; report it as absent.
    (!id.is_empty()).then(|| id.to_string())
}
158
159/// Extract article type from article metadata
160pub fn extract_article_type(content: &str) -> Option<String> {
161    // Look for article-type attribute in article tag
162    if let Some(article_start) = content.find("<article")
163        && let Some(article_end) = content[article_start..].find(">")
164    {
165        let article_tag = &content[article_start..article_start + article_end];
166        if let Some(type_start) = article_tag.find("article-type=\"") {
167            let type_start = type_start + 14; // Length of "article-type=\""
168            if let Some(type_end) = article_tag[type_start..].find('"') {
169                return Some(article_tag[type_start..type_start + type_end].to_string());
170            }
171        }
172    }
173
174    // Fallback: look in article-categories
175    xml_utils::extract_text_between(content, "<subject>", "</subject>")
176}
177
178/// Extract keywords from article metadata
179pub fn extract_keywords(content: &str) -> Vec<String> {
180    let mut keywords = Vec::new();
181
182    if let Some(kwd_start) = content.find("<kwd-group")
183        && let Some(kwd_end) = content[kwd_start..].find("</kwd-group>")
184    {
185        let kwd_section = &content[kwd_start..kwd_start + kwd_end];
186
187        let mut pos = 0;
188        while let Some(kwd_start) = kwd_section[pos..].find("<kwd>") {
189            let kwd_start = pos + kwd_start + 5; // Length of "<kwd>"
190            if let Some(kwd_end) = kwd_section[kwd_start..].find("</kwd>") {
191                let raw_keyword = kwd_section[kwd_start..kwd_start + kwd_end].trim();
192                // Only strip XML tags if the keyword actually contains tags
193                if raw_keyword.contains('<') {
194                    let keyword = xml_utils::strip_xml_tags(raw_keyword);
195                    if !keyword.is_empty() {
196                        keywords.push(keyword);
197                    }
198                } else if !raw_keyword.is_empty() {
199                    keywords.push(raw_keyword.to_string());
200                }
201                pos = kwd_start + kwd_end;
202            } else {
203                break;
204            }
205        }
206    }
207
208    keywords
209}
210
211/// Extract funding information
212pub fn extract_funding(content: &str) -> Vec<FundingInfo> {
213    let mut funding = Vec::new();
214
215    if let Some(funding_start) = content.find("<funding-group>")
216        && let Some(funding_end) = content[funding_start..].find("</funding-group>")
217    {
218        let funding_section =
219            &content[funding_start..funding_start + funding_end + "</funding-group>".len()];
220
221        // Extract funding statement (applies to the funding group as a whole)
222        let statement = xml_utils::extract_text_between(
223            funding_section,
224            "<funding-statement>",
225            "</funding-statement>",
226        );
227
228        let mut pos = 0;
229        while let Some(award_start) = funding_section[pos..].find("<award-group") {
230            let award_start = pos + award_start;
231            if let Some(award_end) = funding_section[award_start..].find("</award-group>") {
232                let award_end = award_start + award_end;
233                let award_section = &funding_section[award_start..award_end];
234
235                let source = xml_utils::extract_text_between(
236                    award_section,
237                    "<funding-source>",
238                    "</funding-source>",
239                )
240                .unwrap_or_else(|| "Unknown Source".to_string());
241
242                let award_id =
243                    xml_utils::extract_text_between(award_section, "<award-id>", "</award-id>");
244
245                funding.push(FundingInfo {
246                    source,
247                    award_id,
248                    statement: statement.clone(),
249                });
250                pos = award_end;
251            } else {
252                break;
253            }
254        }
255    }
256
257    funding
258}
259
260/// Extract conflict of interest statement
261pub fn extract_conflict_of_interest(content: &str) -> Option<String> {
262    // Look for conflict of interest in fn-group
263    if let Some(fn_start) = content.find("<fn-group")
264        && let Some(fn_end) = content[fn_start..].find("</fn-group>")
265    {
266        let fn_section = &content[fn_start..fn_start + fn_end];
267
268        // Look for conflict or competing interest
269        let mut pos = 0;
270        while let Some(fn_start) = fn_section[pos..].find("<fn") {
271            let fn_start = pos + fn_start;
272            if let Some(fn_end) = fn_section[fn_start..].find("</fn>") {
273                let fn_end = fn_start + fn_end;
274                let fn_content = &fn_section[fn_start..fn_end];
275
276                if (fn_content.contains("conflict") || fn_content.contains("competing"))
277                    && let Some(p_start) = fn_content.find("<p>")
278                    && let Some(p_end) = fn_content[p_start..].find("</p>")
279                {
280                    let coi = &fn_content[p_start + 3..p_start + p_end];
281                    return Some(xml_utils::strip_xml_tags(coi));
282                }
283                pos = fn_end;
284            } else {
285                break;
286            }
287        }
288    }
289
290    // Look for conflict statement in dedicated section
291    if let Some(coi_start) = content.find("<sec")
292        && let Some(coi_end) = content[coi_start..].find("</sec>")
293    {
294        let coi_section = &content[coi_start..coi_start + coi_end];
295        if (coi_section.contains("conflict") || coi_section.contains("competing"))
296            && let Some(title_start) = coi_section.find("<title>")
297            && let Some(title_end) = coi_section[title_start..].find("</title>")
298        {
299            let title = &coi_section[title_start + 7..title_start + title_end];
300            if (title.to_lowercase().contains("conflict")
301                || title.to_lowercase().contains("competing"))
302                && let Some(p_start) = coi_section.find("<p>")
303                && let Some(p_end) = coi_section[p_start..].find("</p>")
304            {
305                let coi = &coi_section[p_start + 3..p_start + p_end];
306                return Some(xml_utils::strip_xml_tags(coi));
307            }
308        }
309    }
310
311    None
312}
313
314/// Extract acknowledgments
315///
316/// Strips XML tags and decodes XML entities (e.g., `&#231;` → `ç`).
317pub fn extract_acknowledgments(content: &str) -> Option<String> {
318    xml_utils::extract_text_between(content, "<ack>", "</ack>")
319        .map(|ack| xml_utils::strip_xml_tags(&ack))
320        .map(|s| xml_utils::decode_xml_entities(&s).into_owned())
321}
322
323/// Extract data availability statement
324pub fn extract_data_availability(content: &str) -> Option<String> {
325    // Look for data availability in dedicated section
326    if let Some(data_start) = content.find("<sec")
327        && let Some(data_end) = content[data_start..].find("</sec>")
328    {
329        let data_section = &content[data_start..data_start + data_end];
330        if data_section.contains("data") && data_section.contains("availab") {
331            return Some(xml_utils::strip_xml_tags(data_section));
332        }
333    }
334
335    // Look for data availability statement in supplementary material
336    if let Some(supp_start) = content.find("<supplementary-material")
337        && let Some(supp_end) = content[supp_start..].find("</supplementary-material>")
338    {
339        let supp_section = &content[supp_start..supp_start + supp_end];
340        if supp_section.contains("data") && supp_section.contains("availab") {
341            return Some(xml_utils::strip_xml_tags(supp_section));
342        }
343    }
344
345    None
346}
347
348/// Extract supplementary materials
349pub fn extract_supplementary_materials(content: &str) -> Vec<SupplementaryMaterial> {
350    let mut materials = Vec::new();
351
352    let mut pos = 0;
353    while let Some(supp_start) = content[pos..].find("<supplementary-material") {
354        let supp_start = pos + supp_start;
355        if let Some(supp_end) = content[supp_start..].find("</supplementary-material>") {
356            let supp_end = supp_start + supp_end;
357            let supp_content = &content[supp_start..supp_end];
358
359            let id = xml_utils::extract_attribute_value(supp_content, "id").unwrap_or_else(|| {
360                let supp_num = materials.len() + 1;
361                format!("supp_{supp_num}")
362            });
363
364            let label = xml_utils::extract_text_between(supp_content, "<label>", "</label>");
365            let caption = xml_utils::extract_text_between(supp_content, "<caption>", "</caption>")
366                .and_then(|caption_content| {
367                    // First try to extract just the title from caption
368                    xml_utils::extract_text_between(&caption_content, "<title>", "</title>")
369                        .or_else(|| {
370                            // If no title, extract all content and strip tags
371                            Some(xml_utils::strip_xml_tags(&caption_content))
372                        })
373                })
374                .unwrap_or_else(|| "No caption available".to_string());
375
376            let content_type = xml_utils::extract_attribute_value(supp_content, "content-type");
377            let href = xml_utils::extract_attribute_value(supp_content, "href")
378                .or_else(|| xml_utils::extract_attribute_value(supp_content, "xlink:href"))
379                .or_else(|| {
380                    // Look for href in nested media tags
381                    if let Some(media_start) = supp_content.find("<media") {
382                        if let Some(media_end) = supp_content[media_start..].find(">") {
383                            let media_tag = &supp_content[media_start..media_start + media_end + 1];
384                            xml_utils::extract_attribute_value(media_tag, "xlink:href")
385                                .or_else(|| xml_utils::extract_attribute_value(media_tag, "href"))
386                        } else {
387                            None
388                        }
389                    } else {
390                        None
391                    }
392                });
393
394            materials.push(SupplementaryMaterial {
395                id,
396                content_type,
397                title: Some(caption),
398                description: label,
399                href,
400            });
401            pos = supp_end;
402        } else {
403            break;
404        }
405    }
406
407    materials
408}
409
410/// Extract article title
411pub fn extract_title(content: &str) -> String {
412    xml_utils::extract_text_between_ref(content, "<article-title>", "</article-title>")
413        .map(|s| s.to_string())
414        .unwrap_or_else(|| "Unknown Title".to_string())
415}
416
417/// Extract article subtitle from `<title-group>/<subtitle>`
418pub fn extract_subtitle(content: &str) -> Option<String> {
419    let title_group = xml_utils::extract_element_content(content, "title-group")?;
420    xml_utils::extract_text_between(&title_group, "<subtitle>", "</subtitle>")
421        .map(|s| xml_utils::strip_xml_tags(&s))
422        .filter(|s| !s.is_empty())
423}
424
425/// Extract article language
426pub fn extract_language(content: &str) -> Option<String> {
427    // Look for language in article tag
428    if let Some(article_start) = content.find("<article")
429        && let Some(article_end) = content[article_start..].find(">")
430    {
431        let article_tag = &content[article_start..article_start + article_end];
432        if let Some(lang) = xml_utils::extract_attribute_value(article_tag, "xml:lang") {
433            return Some(lang);
434        }
435    }
436    None
437}
438
439/// Extract article identifiers (DOI, PMID, PMC ID, etc.)
440pub fn extract_article_ids(content: &str) -> Vec<(String, String)> {
441    let mut ids = Vec::new();
442
443    let id_tags = xml_utils::find_all_tags(content, "article-id");
444    for id_tag in id_tags {
445        if let Some(id_type) = xml_utils::extract_attribute_value(&id_tag, "pub-id-type")
446            && let Some(id_value) = xml_utils::extract_element_content(&id_tag, "article-id")
447        {
448            ids.push((id_type, id_value.trim().to_string()));
449        }
450    }
451
452    ids
453}
454
455/// Extract copyright information
456///
457/// Decodes XML entities (e.g., `&#169;` → `©`).
458pub fn extract_copyright(content: &str) -> Option<String> {
459    xml_utils::extract_text_between(content, "<copyright-statement>", "</copyright-statement>")
460        .or_else(|| {
461            xml_utils::extract_text_between(content, "<copyright-year>", "</copyright-year>")
462        })
463        .map(|s| xml_utils::decode_xml_entities(&s).into_owned())
464}
465
466/// Extract license information
467pub fn extract_license(content: &str) -> Option<String> {
468    xml_utils::extract_element_content(content, "license")
469        .map(|license_content| xml_utils::strip_xml_tags(&license_content))
470}
471
472/// Extract abstract text from article metadata
473///
474/// Handles both simple abstracts (`<abstract><p>...</p></abstract>`)
475/// and structured abstracts with sections (`<abstract><sec><title>Background</title><p>...</p></sec>...</abstract>`).
476pub fn extract_abstract(content: &str) -> Option<String> {
477    let abstract_content = xml_utils::extract_element_content(content, "abstract")?;
478
479    // Collect all paragraph text, stripping XML tags
480    let paragraphs = xml_utils::extract_all_text_between(&abstract_content, "<p", "</p>");
481    if paragraphs.is_empty() {
482        // Fallback: strip all tags from abstract content
483        let text = xml_utils::strip_xml_tags(&abstract_content);
484        if text.is_empty() {
485            return None;
486        }
487        return Some(text);
488    }
489
490    let text = paragraphs
491        .iter()
492        .map(|p| {
493            // Each paragraph may start with attributes like `id="Par1">`
494            // Find the closing `>` of the opening tag remnant and take content after it
495            let content = if let Some(gt_pos) = p.find('>') {
496                &p[gt_pos + 1..]
497            } else {
498                p
499            };
500            xml_utils::strip_xml_tags(content)
501        })
502        .filter(|s| !s.is_empty())
503        .collect::<Vec<_>>()
504        .join(" ");
505
506    if text.is_empty() { None } else { Some(text) }
507}
508
509/// Extract publication history dates from `<history>` element
510///
511/// Parses `<date date-type="received">`, `<date date-type="accepted">`, etc.
512pub fn extract_history_dates(content: &str) -> Vec<HistoryDate> {
513    let mut dates = Vec::new();
514
515    let history_content = match xml_utils::extract_element_content(content, "history") {
516        Some(c) => c,
517        None => return dates,
518    };
519
520    let date_tags = xml_utils::find_all_tags(&history_content, "date");
521    for date_tag in &date_tags {
522        let date_type = match xml_utils::extract_attribute_value(date_tag, "date-type") {
523            Some(dt) => dt,
524            None => continue,
525        };
526
527        let year = xml_utils::extract_text_between(date_tag, "<year>", "</year>")
528            .and_then(|y| y.parse::<u16>().ok());
529        let month = xml_utils::extract_text_between(date_tag, "<month>", "</month>")
530            .and_then(|m| m.parse::<u8>().ok());
531        let day = xml_utils::extract_text_between(date_tag, "<day>", "</day>")
532            .and_then(|d| d.parse::<u8>().ok());
533
534        dates.push(HistoryDate {
535            date_type,
536            year,
537            month,
538            day,
539        });
540    }
541
542    dates
543}
544
545/// Extract article categories from `<article-categories>/<subj-group>/<subject>`
546pub fn extract_categories(content: &str) -> Vec<String> {
547    let mut categories = Vec::new();
548
549    let categories_content = match xml_utils::extract_element_content(content, "article-categories")
550    {
551        Some(c) => c,
552        None => return categories,
553    };
554
555    let subjects =
556        xml_utils::extract_all_text_between(&categories_content, "<subject>", "</subject>");
557    for subject in subjects {
558        let cleaned = xml_utils::strip_xml_tags(&subject);
559        if !cleaned.is_empty() {
560            categories.push(cleaned);
561        }
562    }
563
564    categories
565}
566
567/// Extract license URL from `<license xlink:href="...">` attribute
568/// or from `<ali:license_ref>` element content
569pub fn extract_license_url(content: &str) -> Option<String> {
570    // Try <license xlink:href="..."> first
571    if let Some(license_start) = content.find("<license")
572        && let Some(tag_end) = content[license_start..].find('>')
573    {
574        let license_tag = &content[license_start..license_start + tag_end + 1];
575        let url = xml_utils::extract_attribute_value(license_tag, "xlink:href")
576            .or_else(|| xml_utils::extract_attribute_value(license_tag, "href"));
577        if url.is_some() {
578            return url;
579        }
580    }
581
582    // Fallback: extract URL from <ali:license_ref> element content
583    xml_utils::extract_element_content(content, "ali:license_ref")
584        .map(|s| s.trim().to_string())
585        .filter(|s| !s.is_empty())
586}
587
588/// Extract first page number from `<fpage>` element
589///
590/// Handles `<fpage>` with or without attributes (e.g., `<fpage seq="b">54</fpage>`).
591pub fn extract_fpage(content: &str) -> Option<String> {
592    xml_utils::extract_element_content(content, "fpage")
593        .map(|s| s.trim().to_string())
594        .filter(|s| !s.is_empty())
595}
596
597/// Extract last page number from `<lpage>` element
598///
599/// Handles `<lpage>` with or without attributes.
600pub fn extract_lpage(content: &str) -> Option<String> {
601    xml_utils::extract_element_content(content, "lpage")
602        .map(|s| s.trim().to_string())
603        .filter(|s| !s.is_empty())
604}
605
606/// Extract electronic location identifier from `<elocation-id>` element
607pub fn extract_elocation_id(content: &str) -> Option<String> {
608    xml_utils::extract_text_between(content, "<elocation-id>", "</elocation-id>")
609}
610
#[cfg(test)]
mod tests {
    use super::*;

    // --- Titles and identifiers -------------------------------------------

    #[test]
    fn test_extract_title() {
        let xml = r#"<article-title>Test Article Title</article-title>"#;
        assert_eq!(extract_title(xml), "Test Article Title");
    }

    #[test]
    fn test_extract_doi() {
        let xml = r#"<article-id pub-id-type="doi">10.1234/test.doi</article-id>"#;
        assert_eq!(extract_doi(xml).as_deref(), Some("10.1234/test.doi"));
    }

    #[test]
    fn test_extract_pmid() {
        let xml = r#"<article-id pub-id-type="pmid">12345678</article-id>"#;
        assert_eq!(extract_pmid(xml).as_deref(), Some("12345678"));
    }

    // --- Keywords ---------------------------------------------------------

    #[test]
    fn test_extract_keywords() {
        let xml = r#"
        <kwd-group>
            <kwd>keyword1</kwd>
            <kwd>keyword2</kwd>
            <kwd>keyword3</kwd>
        </kwd-group>
        "#;

        assert_eq!(extract_keywords(xml), ["keyword1", "keyword2", "keyword3"]);
    }

    #[test]
    fn test_extract_keywords_with_nested_tags() {
        let xml = r#"
        <kwd-group>
            <kwd><italic toggle="yes">Prevotella copri</italic></kwd>
            <kwd>normal keyword</kwd>
            <kwd><bold>important</bold> keyword</kwd>
        </kwd-group>
        "#;

        assert_eq!(
            extract_keywords(xml),
            ["Prevotella copri", "normal keyword", "important keyword"]
        );
    }

    // --- Dates, type and language -----------------------------------------

    #[test]
    fn test_extract_pub_date() {
        // Each case: (input, expected rendering).
        let cases = [
            (
                r#"<year>2023</year><month>12</month><day>25</day>"#,
                "2023-12-25",
            ),
            (r#"<year>2023</year><month>12</month>"#, "2023-12"),
            (r#"<year>2023</year>"#, "2023"),
            (r#"<title>No date here</title>"#, "Unknown Date"),
        ];

        for (xml, expected) in cases {
            assert_eq!(extract_pub_date(xml), expected);
        }
    }

    #[test]
    fn test_extract_article_type() {
        let xml = r#"<article article-type="research-article">Content</article>"#;
        assert_eq!(
            extract_article_type(xml).as_deref(),
            Some("research-article")
        );
    }

    #[test]
    fn test_extract_language() {
        let xml = r#"<article xml:lang="en">Content</article>"#;
        assert_eq!(extract_language(xml).as_deref(), Some("en"));
    }

    #[test]
    fn test_extract_article_ids() {
        let xml = r#"
        <article-id pub-id-type="doi">10.1234/test</article-id>
        <article-id pub-id-type="pmid">12345</article-id>
        <article-id pub-id-type="pmc">PMC123456</article-id>
        "#;

        let ids = extract_article_ids(xml);
        assert_eq!(ids.len(), 3);

        let expected = [
            ("doi", "10.1234/test"),
            ("pmid", "12345"),
            ("pmc", "PMC123456"),
        ];
        for (kind, value) in expected {
            assert!(ids.iter().any(|(k, v)| k == kind && v == value));
        }
    }

    #[test]
    fn test_extract_acknowledgments() {
        let xml = r#"<ack><p>We thank the contributors for their valuable input.</p></ack>"#;
        assert_eq!(
            extract_acknowledgments(xml).as_deref(),
            Some("We thank the contributors for their valuable input.")
        );
    }

    // --- Abstracts --------------------------------------------------------

    #[test]
    fn test_extract_abstract_simple() {
        let xml = r#"<abstract><p>This is a simple abstract.</p></abstract>"#;
        assert_eq!(
            extract_abstract(xml).as_deref(),
            Some("This is a simple abstract.")
        );
    }

    #[test]
    fn test_extract_abstract_structured() {
        let xml = r#"
        <abstract id="Abs1">
            <sec>
                <title>Background</title>
                <p>Background text.</p>
            </sec>
            <sec>
                <title>Methods</title>
                <p>Methods text.</p>
            </sec>
        </abstract>
        "#;

        let text = extract_abstract(xml).expect("structured abstract should be extracted");
        assert!(text.contains("Background text."));
        assert!(text.contains("Methods text."));
    }

    #[test]
    fn test_extract_abstract_with_attributes() {
        let xml = r#"<abstract><p id="Par1">Text with id attribute.</p></abstract>"#;
        assert_eq!(
            extract_abstract(xml).as_deref(),
            Some("Text with id attribute.")
        );
    }

    #[test]
    fn test_extract_abstract_missing() {
        let xml = r#"<title>No abstract here</title>"#;
        assert!(extract_abstract(xml).is_none());
    }

    // --- History dates ----------------------------------------------------

    #[test]
    fn test_extract_history_dates() {
        let xml = r#"
        <history>
            <date date-type="received">
                <day>21</day>
                <month>2</month>
                <year>2019</year>
            </date>
            <date date-type="accepted">
                <day>23</day>
                <month>4</month>
                <year>2019</year>
            </date>
        </history>
        "#;

        let dates = extract_history_dates(xml);
        assert_eq!(dates.len(), 2);

        // (date-type, year, month, day) in document order.
        let expected = [("received", 2019, 2, 21), ("accepted", 2019, 4, 23)];
        for (date, (kind, year, month, day)) in dates.iter().zip(expected) {
            assert_eq!(date.date_type, kind);
            assert_eq!(date.year, Some(year));
            assert_eq!(date.month, Some(month));
            assert_eq!(date.day, Some(day));
        }
    }

    #[test]
    fn test_extract_history_dates_compact() {
        let xml = r#"
        <history>
<date date-type="received"><day>09</day><month>5</month><year>2023</year></date>
<date date-type="accepted"><day>29</day><month>6</month><year>2023</year></date>
</history>
        "#;

        let dates = extract_history_dates(xml);
        assert_eq!(dates.len(), 2);

        let received = &dates[0];
        assert_eq!(received.date_type, "received");
        assert_eq!(received.year, Some(2023));
        assert_eq!(received.month, Some(5));
        assert_eq!(received.day, Some(9));
    }

    #[test]
    fn test_extract_history_dates_missing() {
        let xml = r#"<article-meta><title>No history</title></article-meta>"#;
        assert!(extract_history_dates(xml).is_empty());
    }

    // --- Categories -------------------------------------------------------

    #[test]
    fn test_extract_categories() {
        let xml = r#"
        <article-categories>
            <subj-group subj-group-type="heading">
                <subject>Original Article</subject>
            </subj-group>
        </article-categories>
        "#;

        assert_eq!(extract_categories(xml), ["Original Article"]);
    }

    #[test]
    fn test_extract_categories_multiple() {
        let xml = r#"
        <article-categories>
            <subj-group subj-group-type="heading">
                <subject>Research Article</subject>
            </subj-group>
            <subj-group subj-group-type="discipline">
                <subject>Biology</subject>
                <subject>Medicine</subject>
            </subj-group>
        </article-categories>
        "#;

        let categories = extract_categories(xml);
        assert_eq!(categories.len(), 3);
        for expected in ["Research Article", "Biology", "Medicine"] {
            assert!(categories.iter().any(|category| category == expected));
        }
    }

    #[test]
    fn test_extract_categories_missing() {
        let xml = r#"<title>No categories</title>"#;
        assert!(extract_categories(xml).is_empty());
    }

    // --- License ----------------------------------------------------------

    #[test]
    fn test_extract_license_url() {
        let xml = r#"<license license-type="open-access" xlink:href="http://creativecommons.org/licenses/by-nc-nd/3.0/"><license-p>Text</license-p></license>"#;
        assert_eq!(
            extract_license_url(xml).as_deref(),
            Some("http://creativecommons.org/licenses/by-nc-nd/3.0/")
        );
    }

    #[test]
    fn test_extract_license_url_missing() {
        let xml = r#"<license><license-p>No URL</license-p></license>"#;
        assert!(extract_license_url(xml).is_none());
    }

    // --- Pagination -------------------------------------------------------

    #[test]
    fn test_extract_fpage_lpage() {
        let xml = r#"<fpage>1865</fpage><lpage>1868</lpage>"#;
        assert_eq!(extract_fpage(xml).as_deref(), Some("1865"));
        assert_eq!(extract_lpage(xml).as_deref(), Some("1868"));
    }

    #[test]
    fn test_extract_elocation_id() {
        let xml = r#"<elocation-id>e12345</elocation-id>"#;
        assert_eq!(extract_elocation_id(xml).as_deref(), Some("e12345"));
    }

    #[test]
    fn test_extract_elocation_id_missing() {
        let xml = r#"<fpage>100</fpage>"#;
        assert!(extract_elocation_id(xml).is_none());
    }
}
pubmed_parser/pmc/parser/metadata.rs

pubmed_parser/pmc/parser/
metadata.rs