1use crate::pmc::domain::{Figure, Section, Table};
2
3use super::reader_utils::{get_attr, make_reader, read_text_content, skip_element};
4use super::xml_utils;
5use quick_xml::events::Event;
6use quick_xml::name::QName;
7
8pub fn extract_sections_enhanced(content: &str) -> Vec<Section> {
10 let mut sections = Vec::new();
11
12 if let Some(abstract_section) = extract_abstract_section(content) {
14 sections.push(abstract_section);
15 }
16
17 if let Some(body_start) = content.find("<body>")
19 && let Some(body_end) = content[body_start..].find("</body>")
20 {
21 let body_content = &content[body_start + 6..body_start + body_end];
22 sections.extend(extract_body_sections(body_content));
23 }
24
25 if let Some(floats_start) = content.find("<floats-group>")
27 && let Some(floats_end) = content[floats_start..].find("</floats-group>")
28 {
29 let floats_content =
30 &content[floats_start..floats_start + floats_end + "</floats-group>".len()];
31 let float_figures = extract_figures_from_content(floats_content);
32 if !float_figures.is_empty() {
33 if let Some(first_section) = sections.first_mut() {
34 first_section.figures.extend(float_figures);
35 } else {
36 sections.push(Section {
37 id: None,
38 section_type: Some("figures".to_string()),
39 label: None,
40 title: Some("Figures".to_string()),
41 content: String::new(),
42 subsections: Vec::new(),
43 figures: float_figures,
44 tables: Vec::new(),
45 formulas: Vec::new(),
46 });
47 }
48 }
49 }
50
51 sections
52}
53
54fn extract_abstract_section(content: &str) -> Option<Section> {
56 let abstract_start = content.find("<abstract")?;
57 let abstract_end_offset = content[abstract_start..].find("</abstract>")?;
58 let abstract_xml =
59 &content[abstract_start..abstract_start + abstract_end_offset + "</abstract>".len()];
60
61 let mut reader = make_reader(abstract_xml);
63 let mut buf = Vec::new();
64 let mut text_parts = Vec::new();
65 let mut in_abstract = false;
66
67 loop {
68 let action = match reader.read_event_into(&mut buf) {
69 Ok(Event::Start(ref e)) => match e.name().as_ref() {
70 b"abstract" => SectionAction::EnterAbstract,
71 b"p" if in_abstract => SectionAction::ReadParagraph,
72 b"title" if in_abstract => SectionAction::SkipTitle,
73 _ => SectionAction::Continue,
74 },
75 Ok(Event::End(ref e)) if e.name().as_ref() == b"abstract" => SectionAction::Break,
76 Ok(Event::Eof) => SectionAction::Break,
77 Err(_) => SectionAction::Break,
78 _ => SectionAction::Continue,
79 };
80 buf.clear();
81
82 match action {
83 SectionAction::EnterAbstract => in_abstract = true,
84 SectionAction::ReadParagraph => {
85 if let Ok(text) = read_text_content(&mut reader, b"p", &mut buf) {
86 let trimmed = text.trim().to_string();
87 if !trimmed.is_empty() {
88 text_parts.push(trimmed);
89 }
90 }
91 }
92 SectionAction::SkipTitle => {
93 let _ = read_text_content(&mut reader, b"title", &mut buf);
94 }
95 SectionAction::Break => break,
96 _ => {}
97 }
98 }
99
100 let figures = extract_figures_from_content(abstract_xml);
102 let tables = extract_tables_from_content(abstract_xml);
103
104 let clean_content = text_parts.join("\n");
105 if !clean_content.is_empty() {
106 Some(Section {
107 id: None,
108 section_type: Some("abstract".to_string()),
109 label: None,
110 title: Some("Abstract".to_string()),
111 content: clean_content,
112 subsections: Vec::new(),
113 figures,
114 tables,
115 formulas: Vec::new(),
116 })
117 } else {
118 None
119 }
120}
121
/// Work item derived from a single XML event by the section-level parsing loops.
///
/// The loops in this file first translate an event into a `SectionAction`, then clear the
/// event buffer, and only afterwards act on it; the handlers are given that same buffer
/// to read further events.
enum SectionAction {
    /// Nothing to do for this event; read the next one.
    Continue,
    /// Leave the current loop (closing tag of the element being parsed, end of input,
    /// or a read error).
    Break,
    /// An `<abstract>` start tag was seen (used by `extract_abstract_section`).
    EnterAbstract,
    /// A `<p>` start tag was seen inside an abstract or inside a `<sec>`.
    ReadParagraph,
    /// A `<sec>` start tag was seen; carries the value of its `id` attribute, if any.
    ReadSection(Option<String>),
    /// A `<p>` start tag was seen by `extract_body_sections` before any `<sec>`.
    ReadBodyParagraph,
    /// A `<fig>` start tag was seen; carries the attributes captured from it.
    ReadFigure(FigAttrs),
    /// A `<table-wrap>` start tag was seen; carries the attributes captured from it.
    ReadTable(TableAttrs),
    /// Start tag of a block element (list, formula, quote, ...) whose text is read as a
    /// whole; carries the tag name.
    ReadTextElement(Vec<u8>),
    /// A `<title>` start tag was seen. Despite the name, only `extract_abstract_section`
    /// discards the text; `parse_section_from_body` stores it as the section title.
    SkipTitle,
    /// Start tag of any other element inside a `<sec>`; carries the tag name so the
    /// whole element can be skipped.
    SkipTag(Vec<u8>),
}
137
138fn extract_body_sections(content: &str) -> Vec<Section> {
140 let mut reader = make_reader(content);
141 let mut buf = Vec::new();
142 let mut sections = Vec::new();
143 let mut has_sec_tags = false;
144 let mut body_paragraphs = Vec::new();
145 let mut body_figures = Vec::new();
146 let mut body_tables = Vec::new();
147
148 loop {
149 let action = match reader.read_event_into(&mut buf) {
150 Ok(Event::Start(ref e)) => match e.name().as_ref() {
151 b"sec" => SectionAction::ReadSection(get_attr(e, b"id")),
152 b"p" if !has_sec_tags => SectionAction::ReadBodyParagraph,
153 b"fig" if !has_sec_tags => SectionAction::ReadFigure(FigAttrs {
154 id: get_attr(e, b"id"),
155 fig_type: get_attr(e, b"fig-type"),
156 }),
157 b"table-wrap" if !has_sec_tags => SectionAction::ReadTable(TableAttrs {
158 id: get_attr(e, b"id"),
159 }),
160 b"list"
162 | b"def-list"
163 | b"disp-formula"
164 | b"disp-formula-group"
165 | b"disp-quote"
166 | b"boxed-text"
167 | b"code"
168 | b"preformat"
169 | b"media"
170 | b"supplementary-material"
171 | b"speech"
172 | b"statement"
173 | b"verse-group"
174 | b"array"
175 | b"graphic"
176 | b"fn-group"
177 if !has_sec_tags =>
178 {
179 SectionAction::ReadTextElement(e.name().as_ref().to_vec())
180 }
181 _ => SectionAction::Continue,
182 },
183 Ok(Event::Eof) => SectionAction::Break,
184 Err(_) => SectionAction::Break,
185 _ => SectionAction::Continue,
186 };
187 buf.clear();
188
189 match action {
190 SectionAction::ReadSection(id) => {
191 has_sec_tags = true;
192 if let Some(section) = parse_section_from_body(&mut reader, id, &mut buf) {
193 sections.push(section);
194 }
195 }
196 SectionAction::ReadBodyParagraph => {
197 let (text, inline_figs, inline_tables) =
198 read_paragraph_with_inline(&mut reader, &mut buf);
199 if !text.is_empty() {
200 body_paragraphs.push(text);
201 }
202 body_figures.extend(inline_figs);
203 body_tables.extend(inline_tables);
204 }
205 SectionAction::ReadFigure(attrs) => {
206 if let Some(fig) = parse_figure_inner(&mut reader, attrs, &mut buf) {
207 body_figures.push(fig);
208 }
209 }
210 SectionAction::ReadTable(attrs) => {
211 if let Some(table) = parse_table_inner(&mut reader, attrs, &mut buf) {
212 body_tables.push(table);
213 }
214 }
215 SectionAction::ReadTextElement(tag) => {
216 if let Ok(text) = read_text_content(&mut reader, &tag, &mut buf) {
217 let trimmed = text.trim().to_string();
218 if !trimmed.is_empty() {
219 body_paragraphs.push(trimmed);
220 }
221 }
222 }
223 SectionAction::Break => break,
224 _ => {}
225 }
226 }
227
228 if sections.is_empty() && !body_paragraphs.is_empty() {
230 let text = body_paragraphs.join("\n");
231 sections.push(Section {
232 id: None,
233 section_type: Some("body".to_string()),
234 label: None,
235 title: None,
236 content: text,
237 subsections: Vec::new(),
238 figures: body_figures,
239 tables: body_tables,
240 formulas: Vec::new(),
241 });
242 }
243
244 sections
245}
246
247fn parse_section_from_body(
254 reader: &mut quick_xml::Reader<&[u8]>,
255 id: Option<String>,
256 buf: &mut Vec<u8>,
257) -> Option<Section> {
258 let mut title: Option<String> = None;
259 let mut content_parts: Vec<String> = Vec::new();
260 let mut subsections = Vec::new();
261 let mut figures = Vec::new();
262 let mut tables = Vec::new();
263
264 loop {
265 let action = match reader.read_event_into(buf) {
266 Ok(Event::Start(ref e)) => match e.name().as_ref() {
267 b"title" => SectionAction::SkipTitle,
268 b"p" => SectionAction::ReadParagraph,
269 b"sec" => SectionAction::ReadSection(get_attr(e, b"id")),
270 b"fig" => SectionAction::ReadFigure(FigAttrs {
271 id: get_attr(e, b"id"),
272 fig_type: get_attr(e, b"fig-type"),
273 }),
274 b"table-wrap" => SectionAction::ReadTable(TableAttrs {
275 id: get_attr(e, b"id"),
276 }),
277 b"list"
279 | b"def-list"
280 | b"disp-formula"
281 | b"disp-formula-group"
282 | b"disp-quote"
283 | b"boxed-text"
284 | b"code"
285 | b"preformat"
286 | b"media"
287 | b"supplementary-material"
288 | b"speech"
289 | b"statement"
290 | b"verse-group"
291 | b"array"
292 | b"graphic"
293 | b"fn-group" => SectionAction::ReadTextElement(e.name().as_ref().to_vec()),
294 other => SectionAction::SkipTag(other.to_vec()),
295 },
296 Ok(Event::End(ref e)) if e.name().as_ref() == b"sec" => SectionAction::Break,
297 Ok(Event::Eof) => SectionAction::Break,
298 Err(_) => SectionAction::Break,
299 _ => SectionAction::Continue,
300 };
301 buf.clear();
302
303 match action {
304 SectionAction::SkipTitle => {
305 if let Ok(t) = read_text_content(reader, b"title", buf) {
306 let t = t.trim().to_string();
307 if !t.is_empty() {
308 title = Some(t);
309 }
310 }
311 }
312 SectionAction::ReadParagraph => {
313 let (text, inline_figs, inline_tables) = read_paragraph_with_inline(reader, buf);
314 let trimmed = text.trim().to_string();
315 if !trimmed.is_empty() {
316 content_parts.push(trimmed);
317 }
318 figures.extend(inline_figs);
319 tables.extend(inline_tables);
320 }
321 SectionAction::ReadSection(sub_id) => {
322 if let Some(sub) = parse_section_from_body(reader, sub_id, buf) {
324 subsections.push(sub);
325 }
326 }
327 SectionAction::ReadFigure(attrs) => {
328 if let Some(fig) = parse_figure_inner(reader, attrs, buf) {
329 figures.push(fig);
330 }
331 }
332 SectionAction::ReadTable(attrs) => {
333 if let Some(table) = parse_table_inner(reader, attrs, buf) {
334 tables.push(table);
335 }
336 }
337 SectionAction::ReadTextElement(tag) => {
338 if let Ok(text) = read_text_content(reader, &tag, buf) {
339 let trimmed = text.trim().to_string();
340 if !trimmed.is_empty() {
341 content_parts.push(trimmed);
342 }
343 }
344 }
345 SectionAction::SkipTag(name) => {
346 let _ = skip_element(reader, QName(&name), buf);
347 }
348 SectionAction::Break => break,
349 _ => {}
350 }
351 }
352
353 let section_content = content_parts.join("\n");
354
355 if !section_content.trim().is_empty()
356 || !subsections.is_empty()
357 || !figures.is_empty()
358 || !tables.is_empty()
359 {
360 Some(Section {
361 id,
362 section_type: Some("section".to_string()),
363 label: None,
364 title,
365 content: section_content.trim().to_string(),
366 subsections,
367 figures,
368 tables,
369 formulas: Vec::new(),
370 })
371 } else {
372 None
373 }
374}
375
376fn read_paragraph_with_inline(
381 reader: &mut quick_xml::Reader<&[u8]>,
382 buf: &mut Vec<u8>,
383) -> (String, Vec<Figure>, Vec<Table>) {
384 let mut text = String::new();
385 let mut figures = Vec::new();
386 let mut tables = Vec::new();
387 let mut depth: u32 = 1; let mut deferred_figs: Vec<FigAttrs> = Vec::new();
390 let mut deferred_tables: Vec<TableAttrs> = Vec::new();
391
392 loop {
393 match reader.read_event_into(buf) {
394 Ok(Event::Start(ref e)) => match e.name().as_ref() {
395 b"p" => depth += 1,
396 b"fig" => {
397 deferred_figs.push(FigAttrs {
398 id: get_attr(e, b"id"),
399 fig_type: get_attr(e, b"fig-type"),
400 });
401 }
402 b"table-wrap" => {
403 deferred_tables.push(TableAttrs {
404 id: get_attr(e, b"id"),
405 });
406 }
407 _ => {} },
409 Ok(Event::Text(ref e)) => {
410 if let Ok(unescaped) = e.unescape() {
412 text.push_str(&unescaped);
413 }
414 }
415 Ok(Event::End(ref e)) => {
416 if e.name().as_ref() == b"p" {
417 depth -= 1;
418 if depth == 0 {
419 buf.clear();
420 break;
421 }
422 }
423 }
424 Ok(Event::Eof) => {
425 buf.clear();
426 break;
427 }
428 Err(_) => {
429 buf.clear();
430 break;
431 }
432 _ => {}
433 }
434 buf.clear();
435
436 for attrs in deferred_figs.drain(..) {
438 if let Some(fig) = parse_figure_inner(reader, attrs, buf) {
439 figures.push(fig);
440 }
441 }
442 for attrs in deferred_tables.drain(..) {
443 if let Some(table) = parse_table_inner(reader, attrs, buf) {
444 tables.push(table);
445 }
446 }
447 }
448
449 (text.trim().to_string(), figures, tables)
450}
451
452fn extract_figures_from_content(content: &str) -> Vec<Figure> {
457 let mut figures = Vec::new();
458 let mut reader = make_reader(content);
459 let mut buf = Vec::new();
460
461 loop {
462 let attrs = match reader.read_event_into(&mut buf) {
463 Ok(Event::Start(ref e)) if e.name().as_ref() == b"fig" => Some(FigAttrs {
464 id: get_attr(e, b"id"),
465 fig_type: get_attr(e, b"fig-type"),
466 }),
467 Ok(Event::Eof) => break,
468 Err(_) => break,
469 _ => None,
470 };
471 buf.clear();
472
473 if let Some(attrs) = attrs
474 && let Some(fig) = parse_figure_inner(&mut reader, attrs, &mut buf)
475 {
476 figures.push(fig);
477 }
478 }
479
480 figures
481}
482
483fn extract_tables_from_content(content: &str) -> Vec<Table> {
485 let mut tables = Vec::new();
486 let mut reader = make_reader(content);
487 let mut buf = Vec::new();
488
489 loop {
490 let attrs = match reader.read_event_into(&mut buf) {
491 Ok(Event::Start(ref e)) if e.name().as_ref() == b"table-wrap" => Some(TableAttrs {
492 id: get_attr(e, b"id"),
493 }),
494 Ok(Event::Eof) => break,
495 Err(_) => break,
496 _ => None,
497 };
498 buf.clear();
499
500 if let Some(attrs) = attrs
501 && let Some(table) = parse_table_inner(&mut reader, attrs, &mut buf)
502 {
503 tables.push(table);
504 }
505 }
506
507 tables
508}
509
/// Attributes captured from a `<fig>` start tag before its children are parsed.
struct FigAttrs {
    /// Value of the `id` attribute, if present.
    id: Option<String>,
    /// Value of the `fig-type` attribute, if present.
    fig_type: Option<String>,
}
514
/// Attributes captured from a `<table-wrap>` start tag before its children are parsed.
struct TableAttrs {
    /// Value of the `id` attribute, if present.
    id: Option<String>,
}
518
519fn parse_figure_inner(
521 reader: &mut quick_xml::Reader<&[u8]>,
522 attrs: FigAttrs,
523 buf: &mut Vec<u8>,
524) -> Option<Figure> {
525 let mut label: Option<String> = None;
526 let mut caption: Option<String> = None;
527 let mut alt_text: Option<String> = None;
528 let mut file_name: Option<String> = None;
529
530 loop {
531 let action = match reader.read_event_into(buf) {
532 Ok(Event::Start(ref e)) => match e.name().as_ref() {
533 b"label" => FigAction::ReadLabel,
534 b"caption" => FigAction::ReadCaption,
535 b"alt-text" => FigAction::ReadAltText,
536 b"graphic" => {
537 let href = get_attr(e, b"xlink:href").or_else(|| get_attr(e, b"href"));
538 FigAction::ReadGraphic(href)
539 }
540 other => FigAction::Skip(other.to_vec()),
541 },
542 Ok(Event::End(ref e)) if e.name().as_ref() == b"fig" => FigAction::Done,
543 Ok(Event::Eof) => FigAction::Done,
544 Err(_) => FigAction::Done,
545 _ => FigAction::Continue,
546 };
547 buf.clear();
548
549 match action {
550 FigAction::ReadLabel => {
551 label = read_text_content(reader, b"label", buf).ok();
552 }
553 FigAction::ReadCaption => {
554 caption = Some(
555 read_text_content(reader, b"caption", buf)
556 .unwrap_or_else(|_| "No caption available".to_string()),
557 );
558 }
559 FigAction::ReadAltText => {
560 alt_text = read_text_content(reader, b"alt-text", buf).ok();
561 }
562 FigAction::ReadGraphic(href) => {
563 file_name = href;
564 let _ = skip_element(reader, QName(b"graphic"), buf);
565 }
566 FigAction::Skip(name) => {
567 let _ = skip_element(reader, QName(&name), buf);
568 }
569 FigAction::Done => break,
570 FigAction::Continue => {}
571 }
572 }
573
574 Some(Figure {
575 id: attrs.id.unwrap_or_else(|| "fig_unknown".to_string()),
576 label,
577 caption: caption.unwrap_or_else(|| "No caption available".to_string()),
578 alt_text,
579 fig_type: attrs.fig_type,
580 graphic_href: file_name,
581 })
582}
583
/// Work item derived from a single XML event while parsing the inside of a `<fig>`.
enum FigAction {
    /// Nothing to do for this event; read the next one.
    Continue,
    /// Stop parsing the figure (`</fig>`, end of input, or a read error).
    Done,
    /// A `<label>` start tag was seen; its text becomes the figure label.
    ReadLabel,
    /// A `<caption>` start tag was seen; its text becomes the figure caption.
    ReadCaption,
    /// An `<alt-text>` start tag was seen; its text becomes the alternative text.
    ReadAltText,
    /// A `<graphic>` start tag was seen; carries its `xlink:href` (or `href`) value, if any.
    ReadGraphic(Option<String>),
    /// Start tag of any other child element; carries the tag name so it can be skipped.
    Skip(Vec<u8>),
}
593
594fn parse_table_inner(
596 reader: &mut quick_xml::Reader<&[u8]>,
597 attrs: TableAttrs,
598 buf: &mut Vec<u8>,
599) -> Option<Table> {
600 let mut label: Option<String> = None;
601 let mut caption: Option<String> = None;
602 let mut footnotes = Vec::new();
603
604 loop {
605 let action = match reader.read_event_into(buf) {
606 Ok(Event::Start(ref e)) => match e.name().as_ref() {
607 b"label" => TableAction::ReadLabel,
608 b"caption" => TableAction::ReadCaption,
609 b"table-wrap-foot" => TableAction::ReadFootnote,
610 other => TableAction::Skip(other.to_vec()),
611 },
612 Ok(Event::End(ref e)) if e.name().as_ref() == b"table-wrap" => TableAction::Done,
613 Ok(Event::Eof) => TableAction::Done,
614 Err(_) => TableAction::Done,
615 _ => TableAction::Continue,
616 };
617 buf.clear();
618
619 match action {
620 TableAction::ReadLabel => {
621 label = read_text_content(reader, b"label", buf).ok();
622 }
623 TableAction::ReadCaption => {
624 caption = Some(
625 read_text_content(reader, b"caption", buf)
626 .unwrap_or_else(|_| "No caption available".to_string()),
627 );
628 }
629 TableAction::ReadFootnote => {
630 if let Ok(text) = read_text_content(reader, b"table-wrap-foot", buf) {
631 let trimmed = text.trim().to_string();
632 if !trimmed.is_empty() {
633 footnotes.push(trimmed);
634 }
635 }
636 }
637 TableAction::Skip(name) => {
638 let _ = skip_element(reader, QName(&name), buf);
639 }
640 TableAction::Done => break,
641 TableAction::Continue => {}
642 }
643 }
644
645 Some(Table {
646 id: attrs.id.unwrap_or_else(|| "table_unknown".to_string()),
647 label,
648 caption: caption.unwrap_or_else(|| "No caption available".to_string()),
649 head: Vec::new(),
650 body: Vec::new(),
651 footnotes,
652 })
653}
654
/// Work item derived from a single XML event while parsing the inside of a `<table-wrap>`.
enum TableAction {
    /// Nothing to do for this event; read the next one.
    Continue,
    /// Stop parsing the table (`</table-wrap>`, end of input, or a read error).
    Done,
    /// A `<label>` start tag was seen; its text becomes the table label.
    ReadLabel,
    /// A `<caption>` start tag was seen; its text becomes the table caption.
    ReadCaption,
    /// A `<table-wrap-foot>` start tag was seen; its text is stored as a footnote.
    ReadFootnote,
    /// Start tag of any other child element; carries the tag name so it can be skipped.
    Skip(Vec<u8>),
}
663
664pub fn extract_section_title(content: &str) -> Option<String> {
666 xml_utils::extract_text_between(content, "<title>", "</title>")
667}
668
669pub fn extract_section_id(content: &str) -> Option<String> {
671 xml_utils::extract_attribute_value(content, "id")
672}
673
674pub fn extract_paragraph_content(content: &str) -> Vec<String> {
676 let mut paragraphs = Vec::new();
677 let mut reader = make_reader(content);
678 let mut buf = Vec::new();
679
680 loop {
681 let is_p = match reader.read_event_into(&mut buf) {
682 Ok(Event::Start(ref e)) if e.name().as_ref() == b"p" => true,
683 Ok(Event::Eof) => break,
684 Err(_) => break,
685 _ => false,
686 };
687 buf.clear();
688
689 if is_p && let Ok(text) = read_text_content(&mut reader, b"p", &mut buf) {
690 let trimmed = text.trim().to_string();
691 if !trimmed.is_empty() {
692 paragraphs.push(trimmed);
693 }
694 }
695 }
696
697 paragraphs
698}
699
/// Unit tests for abstract, body, section, figure and table extraction.
#[cfg(test)]
mod tests {
    use super::*;

    // --- Stand-alone extractors -------------------------------------------------------

    #[test]
    fn test_extract_abstract_section() {
        let content = r#"
        <abstract>
            <p>This is an abstract paragraph.</p>
        </abstract>
        "#;

        let section = extract_abstract_section(content);
        assert!(section.is_some());

        let section = section.unwrap();
        assert_eq!(section.section_type, Some("abstract".to_string()));
        assert_eq!(section.title, Some("Abstract".to_string()));
        assert!(section.content.contains("This is an abstract paragraph."));
    }

    #[test]
    fn test_extract_section_title() {
        let content = r#"<sec id="sec1"><title>Introduction</title><p>Content</p></sec>"#;
        let title = extract_section_title(content);
        assert_eq!(title, Some("Introduction".to_string()));
    }

    #[test]
    fn test_extract_section_id() {
        let content = r#"<sec id="sec1"><title>Introduction</title><p>Content</p></sec>"#;
        let id = extract_section_id(content);
        assert_eq!(id, Some("sec1".to_string()));
    }

    #[test]
    fn test_extract_paragraph_content() {
        let content = r#"
        <p>First paragraph.</p>
        <p>Second paragraph with <em>emphasis</em>.</p>
        "#;

        let paragraphs = extract_paragraph_content(content);
        assert_eq!(paragraphs.len(), 2);
        assert_eq!(paragraphs[0], "First paragraph.");
        assert_eq!(paragraphs[1], "Second paragraph with emphasis.");
    }

    #[test]
    fn test_extract_figures_from_section() {
        let content = r#"
        <fig id="fig1" fig-type="diagram">
            <label>Figure 1</label>
            <caption>This is a test figure.</caption>
            <alt-text>Alternative text</alt-text>
        </fig>
        "#;

        let figures = extract_figures_from_content(content);
        assert_eq!(figures.len(), 1);
        assert_eq!(figures[0].id, "fig1");
        assert_eq!(figures[0].label, Some("Figure 1".to_string()));
        assert_eq!(figures[0].caption, "This is a test figure.");
        assert_eq!(figures[0].alt_text, Some("Alternative text".to_string()));
        assert_eq!(figures[0].fig_type, Some("diagram".to_string()));
    }

    #[test]
    fn test_extract_tables_from_section() {
        let content = r#"
        <root>
            <table-wrap id="table1">
                <label>Table 1</label>
                <caption>This is a test table.</caption>
                <table>
                    <tr><th>Header</th></tr>
                    <tr><td>Data</td></tr>
                </table>
            </table-wrap>
        </root>
        "#;

        let tables = extract_tables_from_content(content);
        assert_eq!(tables.len(), 1);
        assert_eq!(tables[0].id, "table1");
        assert_eq!(tables[0].label, Some("Table 1".to_string()));
        assert_eq!(tables[0].caption, "This is a test table.");
    }

    // --- Body structure ---------------------------------------------------------------

    #[test]
    fn test_nested_sections_depth() {
        let content = r#"
        <body>
            <sec id="sec1">
                <title>Methods</title>
                <sec id="sec1.1">
                    <title>Study Design</title>
                    <p>Inner content.</p>
                </sec>
                <p>Outer content after subsection.</p>
            </sec>
        </body>
        "#;

        let sections = extract_sections_enhanced(content);
        assert_eq!(sections.len(), 1);

        let methods = &sections[0];
        assert_eq!(methods.title, Some("Methods".to_string()));
        assert!(methods.content.contains("Outer content"));
        assert_eq!(methods.subsections.len(), 1);
        assert_eq!(
            methods.subsections[0].title,
            Some("Study Design".to_string())
        );
        assert!(methods.subsections[0].content.contains("Inner content"));
    }

    #[test]
    fn test_body_without_sections() {
        let content = r#"
        <body>
            <p>Just a paragraph.</p>
            <p>Another paragraph.</p>
        </body>
        "#;

        let sections = extract_sections_enhanced(content);
        assert_eq!(sections.len(), 1);
        assert_eq!(sections[0].section_type, Some("body".to_string()));
        assert!(sections[0].content.contains("Just a paragraph."));
        assert!(sections[0].content.contains("Another paragraph."));
    }

    #[test]
    fn test_inline_figure_in_paragraph() {
        let content = r#"
        <body>
            <p>Some text <fig id="fig1"><label>Figure 1</label><caption>Test caption</caption><graphic xlink:href="fig1.jpg"/></fig> more text.</p>
        </body>
        "#;

        let sections = extract_sections_enhanced(content);
        assert_eq!(sections.len(), 1);
        assert!(
            !sections[0].figures.is_empty(),
            "Expected figures to be extracted from inline position"
        );
        assert_eq!(sections[0].figures[0].id, "fig1");
    }

    // --- Block elements inside sections -----------------------------------------------

    #[test]
    fn test_list_text_extraction_in_section() {
        let content = r#"
        <body>
            <sec id="sec1">
                <title>Methods</title>
                <p>Before list.</p>
                <list list-type="bullet">
                    <list-item><p>First item</p></list-item>
                    <list-item><p>Second item</p></list-item>
                </list>
                <p>After list.</p>
            </sec>
        </body>
        "#;

        let sections = extract_sections_enhanced(content);
        assert_eq!(sections.len(), 1);
        let section = &sections[0];
        assert!(section.content.contains("Before list."));
        assert!(section.content.contains("First item"));
        assert!(section.content.contains("Second item"));
        assert!(section.content.contains("After list."));
    }

    #[test]
    fn test_def_list_text_extraction_in_section() {
        let content = r#"
        <body>
            <sec id="sec1">
                <title>Abbreviations</title>
                <def-list>
                    <def-item>
                        <term>DNA</term>
                        <def><p>Deoxyribonucleic acid</p></def>
                    </def-item>
                    <def-item>
                        <term>RNA</term>
                        <def><p>Ribonucleic acid</p></def>
                    </def-item>
                </def-list>
            </sec>
        </body>
        "#;

        let sections = extract_sections_enhanced(content);
        assert_eq!(sections.len(), 1);
        let section = &sections[0];
        assert!(section.content.contains("DNA"));
        assert!(section.content.contains("Deoxyribonucleic acid"));
        assert!(section.content.contains("RNA"));
        assert!(section.content.contains("Ribonucleic acid"));
    }

    #[test]
    fn test_disp_formula_text_extraction() {
        let content = r#"
        <body>
            <sec id="sec1">
                <title>Model</title>
                <p>The equation is:</p>
                <disp-formula id="eq1">
                    <label>(1)</label>
                    <tex-math>E = mc^2</tex-math>
                </disp-formula>
            </sec>
        </body>
        "#;

        let sections = extract_sections_enhanced(content);
        assert_eq!(sections.len(), 1);
        let section = &sections[0];
        assert!(section.content.contains("The equation is:"));
        assert!(
            section.content.contains("E = mc^2"),
            "Formula text should be extracted, got: {}",
            section.content
        );
    }

    #[test]
    fn test_boxed_text_extraction() {
        let content = r#"
        <body>
            <sec id="sec1">
                <title>Results</title>
                <boxed-text>
                    <title>Key Finding</title>
                    <p>Important result goes here.</p>
                </boxed-text>
            </sec>
        </body>
        "#;

        let sections = extract_sections_enhanced(content);
        assert_eq!(sections.len(), 1);
        let section = &sections[0];
        assert!(
            section.content.contains("Important result goes here."),
            "Boxed text content should be extracted, got: {}",
            section.content
        );
    }

    #[test]
    fn test_code_extraction() {
        let content = r#"
        <body>
            <sec id="sec1">
                <title>Implementation</title>
                <code language="python">print("hello world")</code>
            </sec>
        </body>
        "#;

        let sections = extract_sections_enhanced(content);
        assert_eq!(sections.len(), 1);
        let section = &sections[0];
        assert!(
            section.content.contains("print(\"hello world\")"),
            "Code content should be extracted, got: {}",
            section.content
        );
    }

    #[test]
    fn test_disp_quote_extraction() {
        let content = r#"
        <body>
            <sec id="sec1">
                <title>Discussion</title>
                <disp-quote>
                    <p>To be or not to be, that is the question.</p>
                </disp-quote>
            </sec>
        </body>
        "#;

        let sections = extract_sections_enhanced(content);
        assert_eq!(sections.len(), 1);
        let section = &sections[0];
        assert!(section.content.contains("To be or not to be"));
    }

    #[test]
    fn test_preformat_extraction() {
        let content = r#"
        <body>
            <sec id="sec1">
                <title>Data</title>
                <preformat>
Column1 Column2 Column3
value1 value2 value3
                </preformat>
            </sec>
        </body>
        "#;

        let sections = extract_sections_enhanced(content);
        assert_eq!(sections.len(), 1);
        let section = &sections[0];
        assert!(section.content.contains("Column1"));
        assert!(section.content.contains("value1"));
    }

    #[test]
    fn test_mixed_elements_in_section() {
        let content = r#"
        <body>
            <sec id="sec1">
                <title>Mixed Content</title>
                <p>Paragraph text.</p>
                <list list-type="order">
                    <list-item><p>Ordered item one</p></list-item>
                    <list-item><p>Ordered item two</p></list-item>
                </list>
                <fig id="fig1">
                    <label>Figure 1</label>
                    <caption>A test figure</caption>
                </fig>
                <disp-formula id="eq1">
                    <label>(2)</label>
                    <tex-math>a^2 + b^2 = c^2</tex-math>
                </disp-formula>
                <p>Final paragraph.</p>
            </sec>
        </body>
        "#;

        let sections = extract_sections_enhanced(content);
        assert_eq!(sections.len(), 1);
        let section = &sections[0];
        assert!(section.content.contains("Paragraph text."));
        assert!(section.content.contains("Ordered item one"));
        assert!(section.content.contains("a^2 + b^2 = c^2"));
        assert!(section.content.contains("Final paragraph."));
        assert_eq!(section.figures.len(), 1);
        assert_eq!(section.figures[0].id, "fig1");
    }

    #[test]
    fn test_body_without_sec_with_list() {
        let content = r#"
        <body>
            <p>Introduction paragraph.</p>
            <list list-type="bullet">
                <list-item><p>Bullet point one</p></list-item>
                <list-item><p>Bullet point two</p></list-item>
            </list>
            <p>Conclusion paragraph.</p>
        </body>
        "#;

        let sections = extract_sections_enhanced(content);
        assert_eq!(sections.len(), 1);
        assert_eq!(sections[0].section_type, Some("body".to_string()));
        assert!(sections[0].content.contains("Introduction paragraph."));
        assert!(sections[0].content.contains("Bullet point one"));
        assert!(sections[0].content.contains("Bullet point two"));
        assert!(sections[0].content.contains("Conclusion paragraph."));
    }

    #[test]
    fn test_media_in_section() {
        let content = r#"
        <body>
            <sec id="sec1">
                <title>Supplementary</title>
                <media mimetype="video" xlink:href="movie1.mp4">
                    <caption><p>Supplementary Movie 1</p></caption>
                </media>
            </sec>
        </body>
        "#;

        let sections = extract_sections_enhanced(content);
        assert_eq!(sections.len(), 1);
        let section = &sections[0];
        assert!(
            section.content.contains("Supplementary Movie 1"),
            "Media caption should be extracted, got: {}",
            section.content
        );
    }

    #[test]
    fn test_fn_group_in_section() {
        let content = r#"
        <body>
            <sec id="sec1">
                <title>Notes</title>
                <p>Main text.</p>
                <fn-group>
                    <fn id="fn1"><p>Author contribution note.</p></fn>
                    <fn id="fn2"><p>Funding disclosure.</p></fn>
                </fn-group>
            </sec>
        </body>
        "#;

        let sections = extract_sections_enhanced(content);
        assert_eq!(sections.len(), 1);
        let section = &sections[0];
        assert!(section.content.contains("Main text."));
        assert!(
            section.content.contains("Author contribution note."),
            "fn-group content should be extracted, got: {}",
            section.content
        );
    }
}