pubmed_parser/pmc/parser/
section.rs

1use crate::pmc::domain::{Figure, Section, Table};
2
3use super::reader_utils::{get_attr, make_reader, read_text_content, skip_element};
4use super::xml_utils;
5use quick_xml::events::Event;
6use quick_xml::name::QName;
7
8/// Extract all sections from PMC XML content
9pub fn extract_sections_enhanced(content: &str) -> Vec<Section> {
10    let mut sections = Vec::new();
11
12    // Extract abstract first
13    if let Some(abstract_section) = extract_abstract_section(content) {
14        sections.push(abstract_section);
15    }
16
17    // Extract body sections with Reader-based parsing
18    if let Some(body_start) = content.find("<body>")
19        && let Some(body_end) = content[body_start..].find("</body>")
20    {
21        let body_content = &content[body_start + 6..body_start + body_end];
22        sections.extend(extract_body_sections(body_content));
23    }
24
25    // Extract figures from floats-group and add to first section
26    if let Some(floats_start) = content.find("<floats-group>")
27        && let Some(floats_end) = content[floats_start..].find("</floats-group>")
28    {
29        let floats_content =
30            &content[floats_start..floats_start + floats_end + "</floats-group>".len()];
31        let float_figures = extract_figures_from_content(floats_content);
32        if !float_figures.is_empty() {
33            if let Some(first_section) = sections.first_mut() {
34                first_section.figures.extend(float_figures);
35            } else {
36                sections.push(Section {
37                    id: None,
38                    section_type: Some("figures".to_string()),
39                    label: None,
40                    title: Some("Figures".to_string()),
41                    content: String::new(),
42                    subsections: Vec::new(),
43                    figures: float_figures,
44                    tables: Vec::new(),
45                    formulas: Vec::new(),
46                });
47            }
48        }
49    }
50
51    sections
52}
53
54/// Extract abstract section using Reader for text, Reader scan for figures/tables
55fn extract_abstract_section(content: &str) -> Option<Section> {
56    let abstract_start = content.find("<abstract")?;
57    let abstract_end_offset = content[abstract_start..].find("</abstract>")?;
58    let abstract_xml =
59        &content[abstract_start..abstract_start + abstract_end_offset + "</abstract>".len()];
60
61    // Extract text content using Reader
62    let mut reader = make_reader(abstract_xml);
63    let mut buf = Vec::new();
64    let mut text_parts = Vec::new();
65    let mut in_abstract = false;
66
67    loop {
68        let action = match reader.read_event_into(&mut buf) {
69            Ok(Event::Start(ref e)) => match e.name().as_ref() {
70                b"abstract" => SectionAction::EnterAbstract,
71                b"p" if in_abstract => SectionAction::ReadParagraph,
72                b"title" if in_abstract => SectionAction::SkipTitle,
73                _ => SectionAction::Continue,
74            },
75            Ok(Event::End(ref e)) if e.name().as_ref() == b"abstract" => SectionAction::Break,
76            Ok(Event::Eof) => SectionAction::Break,
77            Err(_) => SectionAction::Break,
78            _ => SectionAction::Continue,
79        };
80        buf.clear();
81
82        match action {
83            SectionAction::EnterAbstract => in_abstract = true,
84            SectionAction::ReadParagraph => {
85                if let Ok(text) = read_text_content(&mut reader, b"p", &mut buf) {
86                    let trimmed = text.trim().to_string();
87                    if !trimmed.is_empty() {
88                        text_parts.push(trimmed);
89                    }
90                }
91            }
92            SectionAction::SkipTitle => {
93                let _ = read_text_content(&mut reader, b"title", &mut buf);
94            }
95            SectionAction::Break => break,
96            _ => {}
97        }
98    }
99
100    // Extract figures and tables from the raw abstract content (handles inline figs)
101    let figures = extract_figures_from_content(abstract_xml);
102    let tables = extract_tables_from_content(abstract_xml);
103
104    let clean_content = text_parts.join("\n");
105    if !clean_content.is_empty() {
106        Some(Section {
107            id: None,
108            section_type: Some("abstract".to_string()),
109            label: None,
110            title: Some("Abstract".to_string()),
111            content: clean_content,
112            subsections: Vec::new(),
113            figures,
114            tables,
115            formulas: Vec::new(),
116        })
117    } else {
118        None
119    }
120}
121
/// Simple action enum to work around borrow checker (extract data from event before clearing buf).
///
/// The parsing loops first map the current XML event to one of these variants, clear the
/// shared buffer, and only then act on the variant — so no event borrow is alive while the
/// reader is used again.
enum SectionAction {
    /// Nothing to do for this event; read the next one.
    Continue,
    /// Stop the current parsing loop (closing tag of the enclosing element, EOF, or a read error).
    Break,
    /// The `<abstract>` start tag was seen; paragraphs and titles are honoured from here on.
    EnterAbstract,
    /// A `<p>` start tag was seen inside an abstract or a `<sec>`.
    ReadParagraph,
    /// A `<sec>` start tag was seen; carries the value of its `id` attribute, if any.
    ReadSection(Option<String>),
    /// A `<p>` start tag was seen in a body that has not contained a `<sec>` so far.
    ReadBodyParagraph,
    /// A `<fig>` start tag was seen; carries the attributes captured from it.
    ReadFigure(FigAttrs),
    /// A `<table-wrap>` start tag was seen; carries the attributes captured from it.
    ReadTable(TableAttrs),
    /// Extract text content from a block-level element (list, def-list, formula, etc.).
    /// Carries the element name so the matching end tag can be found.
    ReadTextElement(Vec<u8>),
    /// A `<title>` start tag was seen. The abstract parser discards its text; the `<sec>`
    /// parser stores it as the section title.
    SkipTitle,
    /// Any other element inside a `<sec>`; carries the element name to skip over.
    SkipTag(Vec<u8>),
}
137
138/// Extract body sections using Reader with depth-aware `<sec>` parsing
139fn extract_body_sections(content: &str) -> Vec<Section> {
140    let mut reader = make_reader(content);
141    let mut buf = Vec::new();
142    let mut sections = Vec::new();
143    let mut has_sec_tags = false;
144    let mut body_paragraphs = Vec::new();
145    let mut body_figures = Vec::new();
146    let mut body_tables = Vec::new();
147
148    loop {
149        let action = match reader.read_event_into(&mut buf) {
150            Ok(Event::Start(ref e)) => match e.name().as_ref() {
151                b"sec" => SectionAction::ReadSection(get_attr(e, b"id")),
152                b"p" if !has_sec_tags => SectionAction::ReadBodyParagraph,
153                b"fig" if !has_sec_tags => SectionAction::ReadFigure(FigAttrs {
154                    id: get_attr(e, b"id"),
155                    fig_type: get_attr(e, b"fig-type"),
156                }),
157                b"table-wrap" if !has_sec_tags => SectionAction::ReadTable(TableAttrs {
158                    id: get_attr(e, b"id"),
159                }),
160                // Block-level elements per JATS %para-level; — extract text in no-sec bodies
161                b"list"
162                | b"def-list"
163                | b"disp-formula"
164                | b"disp-formula-group"
165                | b"disp-quote"
166                | b"boxed-text"
167                | b"code"
168                | b"preformat"
169                | b"media"
170                | b"supplementary-material"
171                | b"speech"
172                | b"statement"
173                | b"verse-group"
174                | b"array"
175                | b"graphic"
176                | b"fn-group"
177                    if !has_sec_tags =>
178                {
179                    SectionAction::ReadTextElement(e.name().as_ref().to_vec())
180                }
181                _ => SectionAction::Continue,
182            },
183            Ok(Event::Eof) => SectionAction::Break,
184            Err(_) => SectionAction::Break,
185            _ => SectionAction::Continue,
186        };
187        buf.clear();
188
189        match action {
190            SectionAction::ReadSection(id) => {
191                has_sec_tags = true;
192                if let Some(section) = parse_section_from_body(&mut reader, id, &mut buf) {
193                    sections.push(section);
194                }
195            }
196            SectionAction::ReadBodyParagraph => {
197                let (text, inline_figs, inline_tables) =
198                    read_paragraph_with_inline(&mut reader, &mut buf);
199                if !text.is_empty() {
200                    body_paragraphs.push(text);
201                }
202                body_figures.extend(inline_figs);
203                body_tables.extend(inline_tables);
204            }
205            SectionAction::ReadFigure(attrs) => {
206                if let Some(fig) = parse_figure_inner(&mut reader, attrs, &mut buf) {
207                    body_figures.push(fig);
208                }
209            }
210            SectionAction::ReadTable(attrs) => {
211                if let Some(table) = parse_table_inner(&mut reader, attrs, &mut buf) {
212                    body_tables.push(table);
213                }
214            }
215            SectionAction::ReadTextElement(tag) => {
216                if let Ok(text) = read_text_content(&mut reader, &tag, &mut buf) {
217                    let trimmed = text.trim().to_string();
218                    if !trimmed.is_empty() {
219                        body_paragraphs.push(trimmed);
220                    }
221                }
222            }
223            SectionAction::Break => break,
224            _ => {}
225        }
226    }
227
228    // If no sections found, create a body section from paragraphs
229    if sections.is_empty() && !body_paragraphs.is_empty() {
230        let text = body_paragraphs.join("\n");
231        sections.push(Section {
232            id: None,
233            section_type: Some("body".to_string()),
234            label: None,
235            title: None,
236            content: text,
237            subsections: Vec::new(),
238            figures: body_figures,
239            tables: body_tables,
240            formulas: Vec::new(),
241        });
242    }
243
244    sections
245}
246
247/// Parse a single `<sec>` element using Reader for structure.
248/// The reader has just consumed `Event::Start` for `<sec>`.
249///
250/// Uses a single Reader pass for text, figures, tables, and subsections.
251/// Figures and tables are detected both as direct children of `<sec>` and
252/// inline within `<p>` tags via `read_paragraph_with_inline`.
253fn parse_section_from_body(
254    reader: &mut quick_xml::Reader<&[u8]>,
255    id: Option<String>,
256    buf: &mut Vec<u8>,
257) -> Option<Section> {
258    let mut title: Option<String> = None;
259    let mut content_parts: Vec<String> = Vec::new();
260    let mut subsections = Vec::new();
261    let mut figures = Vec::new();
262    let mut tables = Vec::new();
263
264    loop {
265        let action = match reader.read_event_into(buf) {
266            Ok(Event::Start(ref e)) => match e.name().as_ref() {
267                b"title" => SectionAction::SkipTitle,
268                b"p" => SectionAction::ReadParagraph,
269                b"sec" => SectionAction::ReadSection(get_attr(e, b"id")),
270                b"fig" => SectionAction::ReadFigure(FigAttrs {
271                    id: get_attr(e, b"id"),
272                    fig_type: get_attr(e, b"fig-type"),
273                }),
274                b"table-wrap" => SectionAction::ReadTable(TableAttrs {
275                    id: get_attr(e, b"id"),
276                }),
277                // Block-level elements per JATS %para-level; — extract text instead of skipping
278                b"list"
279                | b"def-list"
280                | b"disp-formula"
281                | b"disp-formula-group"
282                | b"disp-quote"
283                | b"boxed-text"
284                | b"code"
285                | b"preformat"
286                | b"media"
287                | b"supplementary-material"
288                | b"speech"
289                | b"statement"
290                | b"verse-group"
291                | b"array"
292                | b"graphic"
293                | b"fn-group" => SectionAction::ReadTextElement(e.name().as_ref().to_vec()),
294                other => SectionAction::SkipTag(other.to_vec()),
295            },
296            Ok(Event::End(ref e)) if e.name().as_ref() == b"sec" => SectionAction::Break,
297            Ok(Event::Eof) => SectionAction::Break,
298            Err(_) => SectionAction::Break,
299            _ => SectionAction::Continue,
300        };
301        buf.clear();
302
303        match action {
304            SectionAction::SkipTitle => {
305                if let Ok(t) = read_text_content(reader, b"title", buf) {
306                    let t = t.trim().to_string();
307                    if !t.is_empty() {
308                        title = Some(t);
309                    }
310                }
311            }
312            SectionAction::ReadParagraph => {
313                let (text, inline_figs, inline_tables) = read_paragraph_with_inline(reader, buf);
314                let trimmed = text.trim().to_string();
315                if !trimmed.is_empty() {
316                    content_parts.push(trimmed);
317                }
318                figures.extend(inline_figs);
319                tables.extend(inline_tables);
320            }
321            SectionAction::ReadSection(sub_id) => {
322                // Recursive: properly handles nested sections
323                if let Some(sub) = parse_section_from_body(reader, sub_id, buf) {
324                    subsections.push(sub);
325                }
326            }
327            SectionAction::ReadFigure(attrs) => {
328                if let Some(fig) = parse_figure_inner(reader, attrs, buf) {
329                    figures.push(fig);
330                }
331            }
332            SectionAction::ReadTable(attrs) => {
333                if let Some(table) = parse_table_inner(reader, attrs, buf) {
334                    tables.push(table);
335                }
336            }
337            SectionAction::ReadTextElement(tag) => {
338                if let Ok(text) = read_text_content(reader, &tag, buf) {
339                    let trimmed = text.trim().to_string();
340                    if !trimmed.is_empty() {
341                        content_parts.push(trimmed);
342                    }
343                }
344            }
345            SectionAction::SkipTag(name) => {
346                let _ = skip_element(reader, QName(&name), buf);
347            }
348            SectionAction::Break => break,
349            _ => {}
350        }
351    }
352
353    let section_content = content_parts.join("\n");
354
355    if !section_content.trim().is_empty()
356        || !subsections.is_empty()
357        || !figures.is_empty()
358        || !tables.is_empty()
359    {
360        Some(Section {
361            id,
362            section_type: Some("section".to_string()),
363            label: None,
364            title,
365            content: section_content.trim().to_string(),
366            subsections,
367            figures,
368            tables,
369            formulas: Vec::new(),
370        })
371    } else {
372        None
373    }
374}
375
376/// Read a `<p>` element, collecting text while extracting inline figures and tables.
377///
378/// Uses Cow<str> from unescape() to avoid allocations when text has no XML entities.
379/// Detects `<fig>` and `<table-wrap>` inside `<p>` and parses them as structured data.
380fn read_paragraph_with_inline(
381    reader: &mut quick_xml::Reader<&[u8]>,
382    buf: &mut Vec<u8>,
383) -> (String, Vec<Figure>, Vec<Table>) {
384    let mut text = String::new();
385    let mut figures = Vec::new();
386    let mut tables = Vec::new();
387    let mut depth: u32 = 1; // We're inside <p>
388    // Deferred figure/table parsing to avoid borrow conflicts
389    let mut deferred_figs: Vec<FigAttrs> = Vec::new();
390    let mut deferred_tables: Vec<TableAttrs> = Vec::new();
391
392    loop {
393        match reader.read_event_into(buf) {
394            Ok(Event::Start(ref e)) => match e.name().as_ref() {
395                b"p" => depth += 1,
396                b"fig" => {
397                    deferred_figs.push(FigAttrs {
398                        id: get_attr(e, b"id"),
399                        fig_type: get_attr(e, b"fig-type"),
400                    });
401                }
402                b"table-wrap" => {
403                    deferred_tables.push(TableAttrs {
404                        id: get_attr(e, b"id"),
405                    });
406                }
407                _ => {} // Skip child tags, keep reading for text
408            },
409            Ok(Event::Text(ref e)) => {
410                // Use Cow: borrows when no entities, only allocates when unescaping
411                if let Ok(unescaped) = e.unescape() {
412                    text.push_str(&unescaped);
413                }
414            }
415            Ok(Event::End(ref e)) => {
416                if e.name().as_ref() == b"p" {
417                    depth -= 1;
418                    if depth == 0 {
419                        buf.clear();
420                        break;
421                    }
422                }
423            }
424            Ok(Event::Eof) => {
425                buf.clear();
426                break;
427            }
428            Err(_) => {
429                buf.clear();
430                break;
431            }
432            _ => {}
433        }
434        buf.clear();
435
436        // Process deferred figures/tables (buf is cleared, safe to use)
437        for attrs in deferred_figs.drain(..) {
438            if let Some(fig) = parse_figure_inner(reader, attrs, buf) {
439                figures.push(fig);
440            }
441        }
442        for attrs in deferred_tables.drain(..) {
443            if let Some(table) = parse_table_inner(reader, attrs, buf) {
444                tables.push(table);
445            }
446        }
447    }
448
449    (text.trim().to_string(), figures, tables)
450}
451
452// --- Figure and Table extraction using Reader scan ---
453
454/// Extract all `<fig>` elements from content using Reader.
455/// Scans the entire content string regardless of nesting depth.
456fn extract_figures_from_content(content: &str) -> Vec<Figure> {
457    let mut figures = Vec::new();
458    let mut reader = make_reader(content);
459    let mut buf = Vec::new();
460
461    loop {
462        let attrs = match reader.read_event_into(&mut buf) {
463            Ok(Event::Start(ref e)) if e.name().as_ref() == b"fig" => Some(FigAttrs {
464                id: get_attr(e, b"id"),
465                fig_type: get_attr(e, b"fig-type"),
466            }),
467            Ok(Event::Eof) => break,
468            Err(_) => break,
469            _ => None,
470        };
471        buf.clear();
472
473        if let Some(attrs) = attrs
474            && let Some(fig) = parse_figure_inner(&mut reader, attrs, &mut buf)
475        {
476            figures.push(fig);
477        }
478    }
479
480    figures
481}
482
483/// Extract all `<table-wrap>` elements from content using Reader.
484fn extract_tables_from_content(content: &str) -> Vec<Table> {
485    let mut tables = Vec::new();
486    let mut reader = make_reader(content);
487    let mut buf = Vec::new();
488
489    loop {
490        let attrs = match reader.read_event_into(&mut buf) {
491            Ok(Event::Start(ref e)) if e.name().as_ref() == b"table-wrap" => Some(TableAttrs {
492                id: get_attr(e, b"id"),
493            }),
494            Ok(Event::Eof) => break,
495            Err(_) => break,
496            _ => None,
497        };
498        buf.clear();
499
500        if let Some(attrs) = attrs
501            && let Some(table) = parse_table_inner(&mut reader, attrs, &mut buf)
502        {
503            tables.push(table);
504        }
505    }
506
507    tables
508}
509
/// Attributes captured from a `<fig>` start tag before the event buffer is cleared.
struct FigAttrs {
    /// Value of the `id` attribute, if present.
    id: Option<String>,
    /// Value of the `fig-type` attribute, if present.
    fig_type: Option<String>,
}
514
/// Attributes captured from a `<table-wrap>` start tag before the event buffer is cleared.
struct TableAttrs {
    /// Value of the `id` attribute, if present.
    id: Option<String>,
}
518
519/// Parse figure content after `Event::Start` for `<fig>` has been consumed.
520fn parse_figure_inner(
521    reader: &mut quick_xml::Reader<&[u8]>,
522    attrs: FigAttrs,
523    buf: &mut Vec<u8>,
524) -> Option<Figure> {
525    let mut label: Option<String> = None;
526    let mut caption: Option<String> = None;
527    let mut alt_text: Option<String> = None;
528    let mut file_name: Option<String> = None;
529
530    loop {
531        let action = match reader.read_event_into(buf) {
532            Ok(Event::Start(ref e)) => match e.name().as_ref() {
533                b"label" => FigAction::ReadLabel,
534                b"caption" => FigAction::ReadCaption,
535                b"alt-text" => FigAction::ReadAltText,
536                b"graphic" => {
537                    let href = get_attr(e, b"xlink:href").or_else(|| get_attr(e, b"href"));
538                    FigAction::ReadGraphic(href)
539                }
540                other => FigAction::Skip(other.to_vec()),
541            },
542            Ok(Event::End(ref e)) if e.name().as_ref() == b"fig" => FigAction::Done,
543            Ok(Event::Eof) => FigAction::Done,
544            Err(_) => FigAction::Done,
545            _ => FigAction::Continue,
546        };
547        buf.clear();
548
549        match action {
550            FigAction::ReadLabel => {
551                label = read_text_content(reader, b"label", buf).ok();
552            }
553            FigAction::ReadCaption => {
554                caption = Some(
555                    read_text_content(reader, b"caption", buf)
556                        .unwrap_or_else(|_| "No caption available".to_string()),
557                );
558            }
559            FigAction::ReadAltText => {
560                alt_text = read_text_content(reader, b"alt-text", buf).ok();
561            }
562            FigAction::ReadGraphic(href) => {
563                file_name = href;
564                let _ = skip_element(reader, QName(b"graphic"), buf);
565            }
566            FigAction::Skip(name) => {
567                let _ = skip_element(reader, QName(&name), buf);
568            }
569            FigAction::Done => break,
570            FigAction::Continue => {}
571        }
572    }
573
574    Some(Figure {
575        id: attrs.id.unwrap_or_else(|| "fig_unknown".to_string()),
576        label,
577        caption: caption.unwrap_or_else(|| "No caption available".to_string()),
578        alt_text,
579        fig_type: attrs.fig_type,
580        graphic_href: file_name,
581    })
582}
583
/// Action derived from one XML event while parsing the children of a `<fig>`.
/// Holds only owned data, so the event buffer can be cleared before the action runs.
enum FigAction {
    /// Nothing to do for this event; read the next one.
    Continue,
    /// Stop parsing the figure (closing `</fig>`, EOF, or a read error).
    Done,
    /// A `<label>` start tag was seen; read its text.
    ReadLabel,
    /// A `<caption>` start tag was seen; read its text.
    ReadCaption,
    /// An `<alt-text>` start tag was seen; read its text.
    ReadAltText,
    /// A `<graphic>` start tag was seen; carries its `xlink:href` (or `href`) value, if any.
    ReadGraphic(Option<String>),
    /// Any other child element; carries the element name to skip over.
    Skip(Vec<u8>),
}
593
594/// Parse table-wrap content after `Event::Start` for `<table-wrap>` has been consumed.
595fn parse_table_inner(
596    reader: &mut quick_xml::Reader<&[u8]>,
597    attrs: TableAttrs,
598    buf: &mut Vec<u8>,
599) -> Option<Table> {
600    let mut label: Option<String> = None;
601    let mut caption: Option<String> = None;
602    let mut footnotes = Vec::new();
603
604    loop {
605        let action = match reader.read_event_into(buf) {
606            Ok(Event::Start(ref e)) => match e.name().as_ref() {
607                b"label" => TableAction::ReadLabel,
608                b"caption" => TableAction::ReadCaption,
609                b"table-wrap-foot" => TableAction::ReadFootnote,
610                other => TableAction::Skip(other.to_vec()),
611            },
612            Ok(Event::End(ref e)) if e.name().as_ref() == b"table-wrap" => TableAction::Done,
613            Ok(Event::Eof) => TableAction::Done,
614            Err(_) => TableAction::Done,
615            _ => TableAction::Continue,
616        };
617        buf.clear();
618
619        match action {
620            TableAction::ReadLabel => {
621                label = read_text_content(reader, b"label", buf).ok();
622            }
623            TableAction::ReadCaption => {
624                caption = Some(
625                    read_text_content(reader, b"caption", buf)
626                        .unwrap_or_else(|_| "No caption available".to_string()),
627                );
628            }
629            TableAction::ReadFootnote => {
630                if let Ok(text) = read_text_content(reader, b"table-wrap-foot", buf) {
631                    let trimmed = text.trim().to_string();
632                    if !trimmed.is_empty() {
633                        footnotes.push(trimmed);
634                    }
635                }
636            }
637            TableAction::Skip(name) => {
638                let _ = skip_element(reader, QName(&name), buf);
639            }
640            TableAction::Done => break,
641            TableAction::Continue => {}
642        }
643    }
644
645    Some(Table {
646        id: attrs.id.unwrap_or_else(|| "table_unknown".to_string()),
647        label,
648        caption: caption.unwrap_or_else(|| "No caption available".to_string()),
649        head: Vec::new(),
650        body: Vec::new(),
651        footnotes,
652    })
653}
654
/// Action derived from one XML event while parsing the children of a `<table-wrap>`.
/// Holds only owned data, so the event buffer can be cleared before the action runs.
enum TableAction {
    /// Nothing to do for this event; read the next one.
    Continue,
    /// Stop parsing the table (closing `</table-wrap>`, EOF, or a read error).
    Done,
    /// A `<label>` start tag was seen; read its text.
    ReadLabel,
    /// A `<caption>` start tag was seen; read its text.
    ReadCaption,
    /// A `<table-wrap-foot>` start tag was seen; read its text as a footnote.
    ReadFootnote,
    /// Any other child element; carries the element name to skip over.
    Skip(Vec<u8>),
}
663
/// Extract section title from section content.
///
/// Delegates to `xml_utils::extract_text_between` with the literal delimiters `<title>`
/// and `</title>`. Per the unit test in this file,
/// `<sec id="sec1"><title>Introduction</title>…` yields `Some("Introduction")`.
///
/// NOTE(review): because the opening delimiter is the exact string `<title>`, a title tag
/// carrying attributes presumably is not matched — confirm against `extract_text_between`.
pub fn extract_section_title(content: &str) -> Option<String> {
    xml_utils::extract_text_between(content, "<title>", "</title>")
}
668
/// Extract section ID from section content.
///
/// Delegates to `xml_utils::extract_attribute_value`, asking for the attribute named `id`.
/// Per the unit test in this file, `<sec id="sec1">…` yields `Some("sec1")`.
///
/// NOTE(review): which `id` attribute wins when `content` contains several elements with
/// one depends on `extract_attribute_value`, which is not visible here — verify.
pub fn extract_section_id(content: &str) -> Option<String> {
    xml_utils::extract_attribute_value(content, "id")
}
673
674/// Extract all paragraph content from a section
675pub fn extract_paragraph_content(content: &str) -> Vec<String> {
676    let mut paragraphs = Vec::new();
677    let mut reader = make_reader(content);
678    let mut buf = Vec::new();
679
680    loop {
681        let is_p = match reader.read_event_into(&mut buf) {
682            Ok(Event::Start(ref e)) if e.name().as_ref() == b"p" => true,
683            Ok(Event::Eof) => break,
684            Err(_) => break,
685            _ => false,
686        };
687        buf.clear();
688
689        if is_p && let Ok(text) = read_text_content(&mut reader, b"p", &mut buf) {
690            let trimmed = text.trim().to_string();
691            if !trimmed.is_empty() {
692                paragraphs.push(trimmed);
693            }
694        }
695    }
696
697    paragraphs
698}
699
700#[cfg(test)]
701mod tests {
702    use super::*;
703
704    #[test]
705    fn test_extract_abstract_section() {
706        let content = r#"
707        <abstract>
708            <p>This is an abstract paragraph.</p>
709        </abstract>
710        "#;
711
712        let section = extract_abstract_section(content);
713        assert!(section.is_some());
714
715        let section = section.unwrap();
716        assert_eq!(section.section_type, Some("abstract".to_string()));
717        assert_eq!(section.title, Some("Abstract".to_string()));
718        assert!(section.content.contains("This is an abstract paragraph."));
719    }
720
721    #[test]
722    fn test_extract_section_title() {
723        let content = r#"<sec id="sec1"><title>Introduction</title><p>Content</p></sec>"#;
724        let title = extract_section_title(content);
725        assert_eq!(title, Some("Introduction".to_string()));
726    }
727
728    #[test]
729    fn test_extract_section_id() {
730        let content = r#"<sec id="sec1"><title>Introduction</title><p>Content</p></sec>"#;
731        let id = extract_section_id(content);
732        assert_eq!(id, Some("sec1".to_string()));
733    }
734
735    #[test]
736    fn test_extract_paragraph_content() {
737        let content = r#"
738        <p>First paragraph.</p>
739        <p>Second paragraph with <em>emphasis</em>.</p>
740        "#;
741
742        let paragraphs = extract_paragraph_content(content);
743        assert_eq!(paragraphs.len(), 2);
744        assert_eq!(paragraphs[0], "First paragraph.");
745        assert_eq!(paragraphs[1], "Second paragraph with emphasis.");
746    }
747
748    #[test]
749    fn test_extract_figures_from_section() {
750        let content = r#"
751        <fig id="fig1" fig-type="diagram">
752            <label>Figure 1</label>
753            <caption>This is a test figure.</caption>
754            <alt-text>Alternative text</alt-text>
755        </fig>
756        "#;
757
758        let figures = extract_figures_from_content(content);
759        assert_eq!(figures.len(), 1);
760        assert_eq!(figures[0].id, "fig1");
761        assert_eq!(figures[0].label, Some("Figure 1".to_string()));
762        assert_eq!(figures[0].caption, "This is a test figure.");
763        assert_eq!(figures[0].alt_text, Some("Alternative text".to_string()));
764        assert_eq!(figures[0].fig_type, Some("diagram".to_string()));
765    }
766
767    #[test]
768    fn test_extract_tables_from_section() {
769        let content = r#"
770        <root>
771        <table-wrap id="table1">
772            <label>Table 1</label>
773            <caption>This is a test table.</caption>
774            <table>
775                <tr><th>Header</th></tr>
776                <tr><td>Data</td></tr>
777            </table>
778        </table-wrap>
779        </root>
780        "#;
781
782        let tables = extract_tables_from_content(content);
783        assert_eq!(tables.len(), 1);
784        assert_eq!(tables[0].id, "table1");
785        assert_eq!(tables[0].label, Some("Table 1".to_string()));
786        assert_eq!(tables[0].caption, "This is a test table.");
787    }
788
789    #[test]
790    fn test_nested_sections_depth() {
791        let content = r#"
792        <body>
793        <sec id="sec1">
794            <title>Methods</title>
795            <sec id="sec1.1">
796                <title>Study Design</title>
797                <p>Inner content.</p>
798            </sec>
799            <p>Outer content after subsection.</p>
800        </sec>
801        </body>
802        "#;
803
804        let sections = extract_sections_enhanced(content);
805        assert_eq!(sections.len(), 1);
806
807        let methods = &sections[0];
808        assert_eq!(methods.title, Some("Methods".to_string()));
809        assert!(methods.content.contains("Outer content"));
810        assert_eq!(methods.subsections.len(), 1);
811        assert_eq!(
812            methods.subsections[0].title,
813            Some("Study Design".to_string())
814        );
815        assert!(methods.subsections[0].content.contains("Inner content"));
816    }
817
818    #[test]
819    fn test_body_without_sections() {
820        let content = r#"
821        <body>
822            <p>Just a paragraph.</p>
823            <p>Another paragraph.</p>
824        </body>
825        "#;
826
827        let sections = extract_sections_enhanced(content);
828        assert_eq!(sections.len(), 1);
829        assert_eq!(sections[0].section_type, Some("body".to_string()));
830        assert!(sections[0].content.contains("Just a paragraph."));
831        assert!(sections[0].content.contains("Another paragraph."));
832    }
833
834    #[test]
835    fn test_inline_figure_in_paragraph() {
836        let content = r#"
837        <body>
838            <p>Some text <fig id="fig1"><label>Figure 1</label><caption>Test caption</caption><graphic xlink:href="fig1.jpg"/></fig> more text.</p>
839        </body>
840        "#;
841
842        let sections = extract_sections_enhanced(content);
843        assert_eq!(sections.len(), 1);
844        // Figures should be found even when inline in <p>
845        assert!(
846            !sections[0].figures.is_empty(),
847            "Expected figures to be extracted from inline position"
848        );
849        assert_eq!(sections[0].figures[0].id, "fig1");
850    }
851
852    // --- Tests for JATS %para-level; elements that were previously skipped ---
853
854    #[test]
855    fn test_list_text_extraction_in_section() {
856        let content = r#"
857        <body>
858        <sec id="sec1">
859            <title>Methods</title>
860            <p>Before list.</p>
861            <list list-type="bullet">
862                <list-item><p>First item</p></list-item>
863                <list-item><p>Second item</p></list-item>
864            </list>
865            <p>After list.</p>
866        </sec>
867        </body>
868        "#;
869
870        let sections = extract_sections_enhanced(content);
871        assert_eq!(sections.len(), 1);
872        let section = &sections[0];
873        assert!(section.content.contains("Before list."));
874        assert!(section.content.contains("First item"));
875        assert!(section.content.contains("Second item"));
876        assert!(section.content.contains("After list."));
877    }
878
879    #[test]
880    fn test_def_list_text_extraction_in_section() {
881        let content = r#"
882        <body>
883        <sec id="sec1">
884            <title>Abbreviations</title>
885            <def-list>
886                <def-item>
887                    <term>DNA</term>
888                    <def><p>Deoxyribonucleic acid</p></def>
889                </def-item>
890                <def-item>
891                    <term>RNA</term>
892                    <def><p>Ribonucleic acid</p></def>
893                </def-item>
894            </def-list>
895        </sec>
896        </body>
897        "#;
898
899        let sections = extract_sections_enhanced(content);
900        assert_eq!(sections.len(), 1);
901        let section = &sections[0];
902        assert!(section.content.contains("DNA"));
903        assert!(section.content.contains("Deoxyribonucleic acid"));
904        assert!(section.content.contains("RNA"));
905        assert!(section.content.contains("Ribonucleic acid"));
906    }
907
908    #[test]
909    fn test_disp_formula_text_extraction() {
910        let content = r#"
911        <body>
912        <sec id="sec1">
913            <title>Model</title>
914            <p>The equation is:</p>
915            <disp-formula id="eq1">
916                <label>(1)</label>
917                <tex-math>E = mc^2</tex-math>
918            </disp-formula>
919        </sec>
920        </body>
921        "#;
922
923        let sections = extract_sections_enhanced(content);
924        assert_eq!(sections.len(), 1);
925        let section = &sections[0];
926        assert!(section.content.contains("The equation is:"));
927        assert!(
928            section.content.contains("E = mc^2"),
929            "Formula text should be extracted, got: {}",
930            section.content
931        );
932    }
933
934    #[test]
935    fn test_boxed_text_extraction() {
936        let content = r#"
937        <body>
938        <sec id="sec1">
939            <title>Results</title>
940            <boxed-text>
941                <title>Key Finding</title>
942                <p>Important result goes here.</p>
943            </boxed-text>
944        </sec>
945        </body>
946        "#;
947
948        let sections = extract_sections_enhanced(content);
949        assert_eq!(sections.len(), 1);
950        let section = &sections[0];
951        assert!(
952            section.content.contains("Important result goes here."),
953            "Boxed text content should be extracted, got: {}",
954            section.content
955        );
956    }
957
958    #[test]
959    fn test_code_extraction() {
960        let content = r#"
961        <body>
962        <sec id="sec1">
963            <title>Implementation</title>
964            <code language="python">print("hello world")</code>
965        </sec>
966        </body>
967        "#;
968
969        let sections = extract_sections_enhanced(content);
970        assert_eq!(sections.len(), 1);
971        let section = &sections[0];
972        assert!(
973            section.content.contains("print(\"hello world\")"),
974            "Code content should be extracted, got: {}",
975            section.content
976        );
977    }
978
979    #[test]
980    fn test_disp_quote_extraction() {
981        let content = r#"
982        <body>
983        <sec id="sec1">
984            <title>Discussion</title>
985            <disp-quote>
986                <p>To be or not to be, that is the question.</p>
987            </disp-quote>
988        </sec>
989        </body>
990        "#;
991
992        let sections = extract_sections_enhanced(content);
993        assert_eq!(sections.len(), 1);
994        let section = &sections[0];
995        assert!(section.content.contains("To be or not to be"));
996    }
997
998    #[test]
999    fn test_preformat_extraction() {
1000        let content = r#"
1001        <body>
1002        <sec id="sec1">
1003            <title>Data</title>
1004            <preformat>
1005Column1  Column2  Column3
1006value1   value2   value3
1007            </preformat>
1008        </sec>
1009        </body>
1010        "#;
1011
1012        let sections = extract_sections_enhanced(content);
1013        assert_eq!(sections.len(), 1);
1014        let section = &sections[0];
1015        assert!(section.content.contains("Column1"));
1016        assert!(section.content.contains("value1"));
1017    }
1018
1019    #[test]
1020    fn test_mixed_elements_in_section() {
1021        let content = r#"
1022        <body>
1023        <sec id="sec1">
1024            <title>Mixed Content</title>
1025            <p>Paragraph text.</p>
1026            <list list-type="order">
1027                <list-item><p>Ordered item one</p></list-item>
1028                <list-item><p>Ordered item two</p></list-item>
1029            </list>
1030            <fig id="fig1">
1031                <label>Figure 1</label>
1032                <caption>A test figure</caption>
1033            </fig>
1034            <disp-formula id="eq1">
1035                <label>(2)</label>
1036                <tex-math>a^2 + b^2 = c^2</tex-math>
1037            </disp-formula>
1038            <p>Final paragraph.</p>
1039        </sec>
1040        </body>
1041        "#;
1042
1043        let sections = extract_sections_enhanced(content);
1044        assert_eq!(sections.len(), 1);
1045        let section = &sections[0];
1046        assert!(section.content.contains("Paragraph text."));
1047        assert!(section.content.contains("Ordered item one"));
1048        assert!(section.content.contains("a^2 + b^2 = c^2"));
1049        assert!(section.content.contains("Final paragraph."));
1050        assert_eq!(section.figures.len(), 1);
1051        assert_eq!(section.figures[0].id, "fig1");
1052    }
1053
1054    #[test]
1055    fn test_body_without_sec_with_list() {
1056        let content = r#"
1057        <body>
1058            <p>Introduction paragraph.</p>
1059            <list list-type="bullet">
1060                <list-item><p>Bullet point one</p></list-item>
1061                <list-item><p>Bullet point two</p></list-item>
1062            </list>
1063            <p>Conclusion paragraph.</p>
1064        </body>
1065        "#;
1066
1067        let sections = extract_sections_enhanced(content);
1068        assert_eq!(sections.len(), 1);
1069        assert_eq!(sections[0].section_type, Some("body".to_string()));
1070        assert!(sections[0].content.contains("Introduction paragraph."));
1071        assert!(sections[0].content.contains("Bullet point one"));
1072        assert!(sections[0].content.contains("Bullet point two"));
1073        assert!(sections[0].content.contains("Conclusion paragraph."));
1074    }
1075
1076    #[test]
1077    fn test_media_in_section() {
1078        let content = r#"
1079        <body>
1080        <sec id="sec1">
1081            <title>Supplementary</title>
1082            <media mimetype="video" xlink:href="movie1.mp4">
1083                <caption><p>Supplementary Movie 1</p></caption>
1084            </media>
1085        </sec>
1086        </body>
1087        "#;
1088
1089        let sections = extract_sections_enhanced(content);
1090        assert_eq!(sections.len(), 1);
1091        let section = &sections[0];
1092        assert!(
1093            section.content.contains("Supplementary Movie 1"),
1094            "Media caption should be extracted, got: {}",
1095            section.content
1096        );
1097    }
1098
1099    #[test]
1100    fn test_fn_group_in_section() {
1101        let content = r#"
1102        <body>
1103        <sec id="sec1">
1104            <title>Notes</title>
1105            <p>Main text.</p>
1106            <fn-group>
1107                <fn id="fn1"><p>Author contribution note.</p></fn>
1108                <fn id="fn2"><p>Funding disclosure.</p></fn>
1109            </fn-group>
1110        </sec>
1111        </body>
1112        "#;
1113
1114        let sections = extract_sections_enhanced(content);
1115        assert_eq!(sections.len(), 1);
1116        let section = &sections[0];
1117        assert!(section.content.contains("Main text."));
1118        assert!(
1119            section.content.contains("Author contribution note."),
1120            "fn-group content should be extracted, got: {}",
1121            section.content
1122        );
1123    }
1124}
pubmed_parser/pmc/parser/section.rs

pubmed_parser/pmc/parser/
section.rs