1use std::collections::HashMap;
7
8use serde::Serialize;
9
10use pubmed_parser::common::{Author, PublicationDate};
11use pubmed_parser::pmc::{Figure, FundingInfo, PmcArticle, Reference, Section, Table};
12
/// Entity → replacement pairs, applied in order with sequential `str::replace`.
///
/// Both the named and the decimal numeric spelling of each entity are listed.
/// Typographic characters are folded to plain ASCII where a sensible ASCII
/// form exists; Greek letters and `±` are kept as Unicode.
///
/// Ordering matters for one entry only: `&amp;` MUST stay last. Decoding it
/// first would turn `&amp;lt;` into `&lt;` and then into `<`, unescaping the
/// text twice.
static HTML_ENTITIES: &[(&str, &str)] = &[
    // Markup-significant characters.
    ("&lt;", "<"),
    ("&gt;", ">"),
    ("&quot;", "\""),
    ("&#39;", "'"),
    ("&apos;", "'"),
    // Typographic quotes.
    ("&rsquo;", "'"),
    ("&lsquo;", "'"),
    ("&ldquo;", "\""),
    ("&rdquo;", "\""),
    ("&#8217;", "'"),
    ("&#8216;", "'"),
    ("&#8221;", "\""),
    ("&#8220;", "\""),
    // Dashes and spaces.
    ("&ndash;", "-"),
    ("&mdash;", "--"),
    ("&nbsp;", " "),
    ("&#160;", " "),
    ("&#8211;", "-"),
    ("&#8212;", "--"),
    // Ellipsis.
    ("&hellip;", "..."),
    ("&#8230;", "..."),
    // Legal marks.
    ("&trade;", "(TM)"),
    ("&reg;", "(R)"),
    ("&copy;", "(C)"),
    ("&#8482;", "(TM)"),
    ("&#174;", "(R)"),
    ("&#169;", "(C)"),
    // Currency.
    ("&euro;", "EUR"),
    ("&pound;", "GBP"),
    ("&yen;", "JPY"),
    // Math operators.
    ("&minus;", "-"),
    ("&times;", "x"),
    ("&divide;", "/"),
    ("&plusmn;", "±"),
    ("&#215;", "x"),
    ("&#247;", "/"),
    ("&#177;", "±"),
    // Greek letters (named).
    ("&alpha;", "α"),
    ("&beta;", "β"),
    ("&gamma;", "γ"),
    ("&delta;", "δ"),
    ("&epsilon;", "ε"),
    ("&mu;", "μ"),
    ("&pi;", "π"),
    ("&sigma;", "σ"),
    // Greek letters (numeric).
    ("&#945;", "α"),
    ("&#946;", "β"),
    ("&#947;", "γ"),
    ("&#948;", "δ"),
    ("&#949;", "ε"),
    ("&#956;", "μ"),
    ("&#960;", "π"),
    ("&#963;", "σ"),
    // Must be last, see the note above.
    ("&amp;", "&"),
];
78
/// Flat, serializable view of an article's metadata used for YAML frontmatter.
///
/// Fields are serialized in declaration order. Empty vectors and `None`
/// values are omitted from the output via `skip_serializing_if`.
#[derive(Debug, Clone, Serialize)]
struct ArticleMetadata {
    /// Article title.
    title: String,
    /// Author display names; omitted when empty.
    #[serde(skip_serializing_if = "Vec::is_empty")]
    authors: Vec<String>,
    /// Journal title.
    journal: String,
    /// Journal abbreviation, when available.
    #[serde(skip_serializing_if = "Option::is_none")]
    journal_abbrev: Option<String>,
    /// Publication date as a pre-formatted string.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub_date: Option<String>,
    /// PubMed Central identifier.
    pmcid: String,
    /// PubMed identifier, when available.
    #[serde(skip_serializing_if = "Option::is_none")]
    pmid: Option<String>,
    /// Digital Object Identifier, when available.
    #[serde(skip_serializing_if = "Option::is_none")]
    doi: Option<String>,
    /// Article type, when available.
    #[serde(skip_serializing_if = "Option::is_none")]
    article_type: Option<String>,
    /// Keywords; omitted when empty.
    #[serde(skip_serializing_if = "Vec::is_empty")]
    keywords: Vec<String>,
    /// Journal volume, when available.
    #[serde(skip_serializing_if = "Option::is_none")]
    volume: Option<String>,
    /// Journal issue, when available.
    #[serde(skip_serializing_if = "Option::is_none")]
    issue: Option<String>,
    /// Journal publisher, when available.
    #[serde(skip_serializing_if = "Option::is_none")]
    publisher: Option<String>,
}
106
/// Options controlling how a PMC article is rendered to Markdown.
#[derive(Debug, Clone)]
pub struct MarkdownConfig {
    /// Emit the metadata block at the top of the document; when `false`,
    /// only the title heading is written.
    pub include_metadata: bool,
    /// Emit a table of contents after the header.
    pub include_toc: bool,
    /// Heading syntax to use.
    pub heading_style: HeadingStyle,
    /// How entries in the reference list are formatted.
    pub reference_style: ReferenceStyle,
    /// Deepest heading level emitted; deeper headings are capped to it.
    pub max_heading_level: u8,
    /// Append an ORCID link after author names that carry an ORCID.
    pub include_orcid_links: bool,
    /// Render DOI / PMID identifiers as Markdown links.
    pub include_identifier_links: bool,
    /// Emit figure captions (the converter also uses this flag for table captions).
    pub include_figure_captions: bool,
    /// Emit a Markdown image for figures that have a local path supplied.
    pub include_local_figures: bool,
    /// Render the metadata block as YAML frontmatter instead of Markdown text.
    pub use_yaml_frontmatter: bool,
}
131
/// Markdown heading syntax.
///
/// The enum has no payload, so it is `Copy` and `Eq` in addition to the
/// derives callers already relied on.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum HeadingStyle {
    /// Hash-prefixed headings, e.g. `## Title`.
    ATX,
    /// Underlined headings (`===` / `---`) for levels 1 and 2; deeper levels
    /// fall back to hash-prefixed headings.
    Setext,
}
140
/// Formatting applied to entries in the reference list.
///
/// The enum has no payload, so it is `Copy` and `Eq` in addition to the
/// derives callers already relied on.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum ReferenceStyle {
    /// Numbered list of full citations.
    Numbered,
    /// Bulleted list of "First Author (Year)" entries.
    AuthorYear,
    /// Bulleted list of full citations.
    FullCitation,
}
151
/// Default configuration: Markdown metadata block, ATX headings, numbered
/// references, links and captions enabled; table of contents, local figures
/// and YAML frontmatter disabled.
impl Default for MarkdownConfig {
    fn default() -> Self {
        Self {
            include_metadata: true,
            include_toc: false,
            heading_style: HeadingStyle::ATX,
            reference_style: ReferenceStyle::Numbered,
            // 6 is the deepest heading level Markdown defines.
            max_heading_level: 6,
            include_orcid_links: true,
            include_identifier_links: true,
            include_figure_captions: true,
            include_local_figures: false,
            use_yaml_frontmatter: false,
        }
    }
}
168
169fn format_first_pub_date(dates: &[PublicationDate]) -> Option<String> {
171 let d = dates.first()?;
172 let year = d.year?;
173 match (d.month, d.day) {
174 (Some(m), Some(day)) => Some(format!("{year}-{m:02}-{day:02}")),
175 (Some(m), None) => Some(format!("{year}-{m:02}")),
176 _ => Some(year.to_string()),
177 }
178}
179
/// Converts a parsed `PmcArticle` into a Markdown document.
///
/// Build one with [`PmcMarkdownConverter::new`] or
/// [`PmcMarkdownConverter::with_config`], then adjust it with the `with_*`
/// builder methods.
pub struct PmcMarkdownConverter {
    /// Rendering options consulted by every conversion step.
    config: MarkdownConfig,
}
184
185impl PmcMarkdownConverter {
186 pub fn new() -> Self {
188 Self {
189 config: MarkdownConfig::default(),
190 }
191 }
192
193 pub fn with_config(config: MarkdownConfig) -> Self {
195 Self { config }
196 }
197
198 pub fn with_include_metadata(mut self, include: bool) -> Self {
200 self.config.include_metadata = include;
201 self
202 }
203
204 pub fn with_include_toc(mut self, include: bool) -> Self {
206 self.config.include_toc = include;
207 self
208 }
209
210 pub fn with_heading_style(mut self, style: HeadingStyle) -> Self {
212 self.config.heading_style = style;
213 self
214 }
215
216 pub fn with_reference_style(mut self, style: ReferenceStyle) -> Self {
218 self.config.reference_style = style;
219 self
220 }
221
222 pub fn with_max_heading_level(mut self, level: u8) -> Self {
224 self.config.max_heading_level = level.clamp(1, 6);
225 self
226 }
227
228 pub fn with_include_orcid_links(mut self, include: bool) -> Self {
230 self.config.include_orcid_links = include;
231 self
232 }
233
234 pub fn with_include_identifier_links(mut self, include: bool) -> Self {
236 self.config.include_identifier_links = include;
237 self
238 }
239
240 pub fn with_include_figure_captions(mut self, include: bool) -> Self {
242 self.config.include_figure_captions = include;
243 self
244 }
245
246 pub fn with_yaml_frontmatter(mut self, use_yaml: bool) -> Self {
248 self.config.use_yaml_frontmatter = use_yaml;
249 self
250 }
251
252 pub fn convert_with_figures(
254 &self,
255 article: &PmcArticle,
256 figure_paths: Option<&HashMap<String, String>>,
257 ) -> String {
258 let mut markdown = String::new();
259
260 if self.config.include_metadata {
262 markdown.push_str(&self.convert_metadata(article));
263 markdown.push_str("\n\n");
264 } else {
265 markdown.push_str(&self.format_heading(&self.clean_content(&article.title), 1));
267 markdown.push_str("\n\n");
268 }
269
270 if self.config.include_toc {
272 markdown.push_str(&self.convert_toc(article));
273 markdown.push_str("\n\n");
274 }
275
276 markdown.push_str(&self.convert_sections_with_figures(&article.sections, 1, figure_paths));
278
279 if !article.references.is_empty() {
281 markdown.push_str(&self.convert_references(&article.references));
282 }
283
284 markdown.push_str(&self.convert_additional_sections(article));
286
287 markdown.trim().to_string()
288 }
289
290 pub fn convert(&self, article: &PmcArticle) -> String {
292 let mut markdown = String::new();
293
294 if self.config.include_metadata {
296 markdown.push_str(&self.convert_metadata(article));
297 markdown.push_str("\n\n");
298 } else {
299 markdown.push_str(&self.format_heading(&self.clean_content(&article.title), 1));
301 markdown.push_str("\n\n");
302 }
303
304 if self.config.include_toc {
306 markdown.push_str(&self.convert_toc(article));
307 markdown.push_str("\n\n");
308 }
309
310 markdown.push_str(&self.convert_sections(&article.sections, 1));
312
313 if !article.references.is_empty() {
315 markdown.push_str(&self.convert_references(&article.references));
316 }
317
318 markdown.push_str(&self.convert_additional_sections(article));
320
321 markdown.trim().to_string()
322 }
323
324 fn generate_yaml_frontmatter(&self, article: &PmcArticle) -> String {
326 let metadata = ArticleMetadata {
328 title: self.clean_content(&article.title),
329 authors: article
330 .authors
331 .iter()
332 .map(|a| self.clean_content(&a.full_name))
333 .collect(),
334 journal: self.clean_content(&article.journal.title),
335 journal_abbrev: article
336 .journal
337 .abbreviation
338 .as_ref()
339 .map(|a| self.clean_content(a)),
340 pub_date: format_first_pub_date(&article.pub_dates),
341 pmcid: article.pmcid.as_str(),
342 pmid: article.pmid.as_ref().map(|p| p.as_str()),
343 doi: article.doi.as_ref().map(|d| self.clean_content(d)),
344 article_type: article.article_type.as_ref().map(|t| self.clean_content(t)),
345 keywords: article
346 .keywords
347 .iter()
348 .map(|k| self.clean_content(k))
349 .collect(),
350 volume: article.volume.as_ref().map(|v| self.clean_content(v)),
351 issue: article.issue.as_ref().map(|i| self.clean_content(i)),
352 publisher: article
353 .journal
354 .publisher
355 .as_ref()
356 .map(|p| self.clean_content(p)),
357 };
358
359 match serde_yaml::to_string(&metadata) {
361 Ok(yaml_content) => format!("---\n{}---\n", yaml_content),
362 Err(e) => {
363 tracing::warn!("Failed to serialize YAML frontmatter: {}", e);
364 "---\n---\n".to_string()
366 }
367 }
368 }
369
370 fn convert_metadata(&self, article: &PmcArticle) -> String {
372 if self.config.use_yaml_frontmatter {
374 return self.generate_yaml_frontmatter(article);
375 }
376
377 let mut metadata = String::new();
378
379 metadata.push_str(&self.format_heading(&self.clean_content(&article.title), 1));
381 metadata.push('\n');
382
383 if !article.authors.is_empty() {
385 metadata.push_str("\n**Authors:** ");
386 metadata.push_str(&self.format_authors(&article.authors));
387 metadata.push('\n');
388 }
389
390 let journal_title = &article.journal.title;
392 metadata.push_str(&format!("\n**Journal:** {journal_title}"));
393 if let Some(abbrev) = &article.journal.abbreviation {
394 metadata.push_str(&format!(" ({abbrev})"));
395 }
396 metadata.push('\n');
397
398 if let Some(pub_date) = format_first_pub_date(&article.pub_dates) {
400 metadata.push_str(&format!("**Published:** {pub_date}\n"));
401 }
402
403 let mut identifiers = Vec::new();
405 if let Some(doi) = &article.doi {
406 if self.config.include_identifier_links {
407 identifiers.push(format!("[DOI: {doi}](https://doi.org/{doi})"));
408 } else {
409 identifiers.push(format!("DOI: {doi}"));
410 }
411 }
412 if let Some(pmid) = &article.pmid {
413 let pmid_str = pmid.as_str();
414 if self.config.include_identifier_links {
415 identifiers.push(format!(
416 "[PMID: {pmid_str}](https://pubmed.ncbi.nlm.nih.gov/{pmid_str})"
417 ));
418 } else {
419 identifiers.push(format!("PMID: {pmid_str}"));
420 }
421 }
422 let pmcid = article.pmcid.as_str();
423 identifiers.push(format!("PMC: {pmcid}"));
424
425 if !identifiers.is_empty() {
426 let identifiers_str = identifiers.join(" | ");
427 metadata.push_str(&format!("**Identifiers:** {identifiers_str}\n"));
428 }
429
430 if let Some(article_type) = &article.article_type {
432 metadata.push_str(&format!("**Article Type:** {article_type}\n"));
433 }
434
435 if !article.keywords.is_empty() {
437 let clean_keywords: Vec<String> = article
438 .keywords
439 .iter()
440 .map(|k| self.clean_content(k))
441 .collect();
442 let keywords_str = clean_keywords.join(", ");
443 metadata.push_str(&format!("**Keywords:** {keywords_str}\n"));
444 }
445
446 let mut journal_details = Vec::new();
448 if let Some(volume) = &article.volume {
449 journal_details.push(format!("Volume {volume}"));
450 }
451 if let Some(issue) = &article.issue {
452 journal_details.push(format!("Issue {issue}"));
453 }
454 if let Some(publisher) = &article.journal.publisher {
455 journal_details.push(format!("Publisher: {publisher}"));
456 }
457 if !journal_details.is_empty() {
458 metadata.push_str(&format!(
459 "**Journal Details:** {}\n",
460 journal_details.join(" | ")
461 ));
462 }
463
464 metadata
465 }
466
467 fn convert_toc(&self, article: &PmcArticle) -> String {
469 let mut toc = String::new();
470 toc.push_str(&self.format_heading("Table of Contents", 2));
471 toc.push('\n');
472
473 for (i, section) in article.sections.iter().enumerate() {
474 let default_title = "Untitled".to_string();
475 let title = section.title.as_ref().unwrap_or(&default_title);
476 let anchor = self.create_anchor(title);
477 let index = i + 1;
478 toc.push_str(&format!("{index}. [{title}](#{anchor})\n"));
479
480 for (j, subsection) in section.subsections.iter().enumerate() {
482 let default_sub_title = "Untitled".to_string();
483 let sub_title = subsection.title.as_ref().unwrap_or(&default_sub_title);
484 let sub_anchor = self.create_anchor(sub_title);
485 let main_index = i + 1;
486 let sub_index = j + 1;
487 toc.push_str(&format!(
488 " {main_index}.{sub_index}. [{sub_title}](#{sub_anchor})\n"
489 ));
490 }
491 }
492
493 toc
494 }
495
496 fn convert_sections_with_figures(
498 &self,
499 sections: &[Section],
500 level: u8,
501 figure_paths: Option<&HashMap<String, String>>,
502 ) -> String {
503 let mut content = String::new();
504
505 for section in sections {
506 if let Some(title) = §ion.title {
508 content.push_str(&self.format_heading(title, level));
509 content.push_str("\n\n");
510 }
511
512 if !section.content.is_empty() {
514 content.push_str(&self.clean_content(§ion.content));
515 content.push_str("\n\n");
516 }
517
518 if self.config.include_figure_captions {
520 for figure in §ion.figures {
521 let figure_path = figure_paths.and_then(|paths| paths.get(&figure.id));
522 content.push_str(&self.convert_figure_with_path(figure, figure_path));
523 content.push_str("\n\n");
524 }
525 }
526
527 if self.config.include_figure_captions {
529 for table in §ion.tables {
530 content.push_str(&self.convert_table(table));
531 content.push_str("\n\n");
532 }
533 }
534
535 if !section.subsections.is_empty() {
537 let next_level = (level + 1).min(self.config.max_heading_level);
538 content.push_str(&self.convert_sections_with_figures(
539 §ion.subsections,
540 next_level,
541 figure_paths,
542 ));
543 }
544 }
545
546 content
547 }
548
549 fn convert_sections(&self, sections: &[Section], level: u8) -> String {
551 let mut content = String::new();
552
553 for section in sections {
554 if let Some(title) = §ion.title {
556 content.push_str(&self.format_heading(title, level));
557 content.push_str("\n\n");
558 }
559
560 if !section.content.is_empty() {
562 content.push_str(&self.clean_content(§ion.content));
563 content.push_str("\n\n");
564 }
565
566 if self.config.include_figure_captions {
568 for figure in §ion.figures {
569 content.push_str(&self.convert_figure(figure));
570 content.push_str("\n\n");
571 }
572 }
573
574 if self.config.include_figure_captions {
576 for table in §ion.tables {
577 content.push_str(&self.convert_table(table));
578 content.push_str("\n\n");
579 }
580 }
581
582 if !section.subsections.is_empty() {
584 let next_level = (level + 1).min(self.config.max_heading_level);
585 content.push_str(&self.convert_sections(§ion.subsections, next_level));
586 }
587 }
588
589 content
590 }
591
592 fn convert_references(&self, references: &[Reference]) -> String {
594 let mut content = String::new();
595 content.push_str(&self.format_heading("References", 2));
596 content.push_str("\n\n");
597
598 match self.config.reference_style {
599 ReferenceStyle::Numbered => {
600 for (i, reference) in references.iter().enumerate() {
601 content.push_str(&format!(
602 "{}. {}\n",
603 i + 1,
604 self.format_reference(reference)
605 ));
606 }
607 }
608 ReferenceStyle::AuthorYear | ReferenceStyle::FullCitation => {
609 for reference in references {
610 let formatted_ref = self.format_reference(reference);
611 content.push_str(&format!("- {formatted_ref}\n"));
612 }
613 }
614 }
615
616 content.push('\n');
617 content
618 }
619
620 fn convert_additional_sections(&self, article: &PmcArticle) -> String {
622 let mut content = String::new();
623
624 if !article.funding.is_empty() {
626 content.push_str(&self.format_heading("Funding", 2));
627 content.push_str("\n\n");
628 for funding in &article.funding {
629 content.push_str(&self.format_funding(funding));
630 content.push('\n');
631 }
632 content.push('\n');
633 }
634
635 if let Some(coi) = &article.conflict_of_interest {
637 content.push_str(&self.format_heading("Conflict of Interest", 2));
638 content.push_str("\n\n");
639 content.push_str(&self.clean_content(coi));
640 content.push_str("\n\n");
641 }
642
643 if let Some(ack) = &article.acknowledgments {
645 content.push_str(&self.format_heading("Acknowledgments", 2));
646 content.push_str("\n\n");
647 content.push_str(&self.clean_content(ack));
648 content.push_str("\n\n");
649 }
650
651 if let Some(data_avail) = &article.data_availability {
653 content.push_str(&self.format_heading("Data Availability", 2));
654 content.push_str("\n\n");
655 content.push_str(&self.clean_content(data_avail));
656 content.push_str("\n\n");
657 }
658
659 content
660 }
661
662 fn format_heading(&self, text: &str, level: u8) -> String {
664 let level = level.min(self.config.max_heading_level);
665
666 match self.config.heading_style {
667 HeadingStyle::ATX => {
668 let hashes = "#".repeat(level as usize);
669 format!("{hashes} {text}")
670 }
671 HeadingStyle::Setext => {
672 if level == 1 {
673 let underline = "=".repeat(text.len());
674 format!("{text}\n{underline}")
675 } else if level == 2 {
676 let underline = "-".repeat(text.len());
677 format!("{text}\n{underline}")
678 } else {
679 let hashes = "#".repeat(level as usize);
681 format!("{hashes} {text}")
682 }
683 }
684 }
685 }
686
687 fn format_authors(&self, authors: &[Author]) -> String {
689 authors
690 .iter()
691 .map(|author| {
692 let mut name = self.clean_content(&author.full_name);
693
694 if author.is_corresponding {
696 name.push('*');
697 }
698
699 if self.config.include_orcid_links
701 && let Some(orcid) = &author.orcid
702 {
703 let cleaned_orcid = self.clean_content(orcid);
705 let clean_orcid = cleaned_orcid.trim_start_matches("https://orcid.org/");
706
707 if clean_orcid.len() >= 19 && clean_orcid.matches('-').count() == 3 {
709 name.push_str(&format!(" ([ORCID](https://orcid.org/{clean_orcid}))"));
710 }
711 }
712
713 name
714 })
715 .collect::<Vec<String>>()
716 .join(", ")
717 }
718
719 fn format_reference(&self, reference: &Reference) -> String {
721 match self.config.reference_style {
722 ReferenceStyle::Numbered | ReferenceStyle::FullCitation => {
723 let citation = reference.format_citation();
724
725 if self.config.include_identifier_links {
726 let mut formatted = citation;
727
728 if let Some(doi) = &reference.doi {
730 formatted.push_str(&format!(" [DOI](https://doi.org/{doi})"));
731 }
732
733 if let Some(pmid) = &reference.pmid {
735 formatted
736 .push_str(&format!(" [PMID](https://pubmed.ncbi.nlm.nih.gov/{pmid})"));
737 }
738
739 formatted
740 } else {
741 citation
742 }
743 }
744 ReferenceStyle::AuthorYear => {
745 if let (Some(first_author), Some(year)) =
746 (reference.authors.first(), reference.year.as_ref())
747 {
748 format!("{} ({})", first_author.full_name, year)
749 } else {
750 reference.format_citation()
751 }
752 }
753 }
754 }
755
756 fn format_funding(&self, funding: &FundingInfo) -> String {
758 let source = &funding.source;
759 let mut text = format!("- **{source}**");
760
761 if let Some(award_id) = &funding.award_id {
762 text.push_str(&format!(" (Award ID: {award_id})"));
763 }
764
765 if let Some(statement) = &funding.statement {
766 let content = self.clean_content(statement);
767 text.push_str(&format!(": {content}"));
768 }
769
770 text
771 }
772
773 fn convert_figure_with_path(&self, figure: &Figure, figure_path: Option<&String>) -> String {
775 let mut content = String::new();
776
777 if self.config.include_local_figures
779 && let Some(path) = figure_path
780 {
781 let alt_text = figure
782 .alt_text
783 .as_deref()
784 .or(figure.label.as_deref())
785 .unwrap_or(&figure.id);
786 content.push_str(&format!("\n\n"));
787 }
788
789 if let Some(label) = &figure.label {
790 content.push_str(&format!("**{label}**"));
791 } else {
792 let figure_id = &figure.id;
793 content.push_str(&format!("**Figure {figure_id}**"));
794 }
795
796 let caption = self.clean_content(&figure.caption);
797 content.push_str(&format!(": {caption}"));
798
799 if let Some(alt_text) = &figure.alt_text {
800 let alt_content = self.clean_content(alt_text);
801 content.push_str(&format!("\n\n*Alt text: {alt_content}*"));
802 }
803
804 content
805 }
806
807 fn convert_figure(&self, figure: &Figure) -> String {
809 let mut content = String::new();
810
811 if let Some(label) = &figure.label {
812 content.push_str(&format!("**{label}**"));
813 } else {
814 let figure_id = &figure.id;
815 content.push_str(&format!("**Figure {figure_id}**"));
816 }
817
818 let caption = self.clean_content(&figure.caption);
819 content.push_str(&format!(": {caption}"));
820
821 if let Some(alt_text) = &figure.alt_text {
822 let alt_content = self.clean_content(alt_text);
823 content.push_str(&format!("\n\n*Alt text: {alt_content}*"));
824 }
825
826 content
827 }
828
829 fn convert_table(&self, table: &Table) -> String {
831 let mut content = String::new();
832
833 if let Some(label) = &table.label {
834 content.push_str(&format!("**{label}**"));
835 } else {
836 let table_id = &table.id;
837 content.push_str(&format!("**Table {table_id}**"));
838 }
839
840 let caption = self.clean_content(&table.caption);
841 content.push_str(&format!(": {caption}"));
842
843 if !table.footnotes.is_empty() {
844 content.push_str("\n\n*Footnotes:*\n");
845 for (i, footnote) in table.footnotes.iter().enumerate() {
846 let index = i + 1;
847 let footnote_content = self.clean_content(footnote);
848 content.push_str(&format!("{index}. {footnote_content}\n"));
849 }
850 }
851
852 content
853 }
854
855 fn clean_content(&self, content: &str) -> String {
857 let mut cleaned = content.to_string();
859
860 cleaned = regex::Regex::new(r"<[^>]*>")
862 .unwrap()
863 .replace_all(&cleaned, "")
864 .to_string();
865
866 for (entity, replacement) in HTML_ENTITIES {
868 cleaned = cleaned.replace(entity, replacement);
869 }
870
871 cleaned = regex::Regex::new(r"\s+")
873 .unwrap()
874 .replace_all(&cleaned, " ")
875 .trim()
876 .to_string();
877
878 cleaned
879 }
880
881 fn create_anchor(&self, title: &str) -> String {
883 title
884 .to_lowercase()
885 .chars()
886 .map(|c| if c.is_alphanumeric() { c } else { '-' })
887 .collect::<String>()
888 .split('-')
889 .filter(|s| !s.is_empty())
890 .collect::<Vec<_>>()
891 .join("-")
892 }
893}
894
/// The default converter is the same as [`PmcMarkdownConverter::new`].
impl Default for PmcMarkdownConverter {
    fn default() -> Self {
        Self::new()
    }
}
900
901#[cfg(test)]
902mod tests {
903 use super::*;
904 use pubmed_parser::common::{Author, PmcId, PubMedId, PublicationDate};
905 use pubmed_parser::pmc::{JournalMeta, PmcArticle};
906
    /// Builds a minimal article fixture with the given title and PMC id.
    ///
    /// The journal title is "Test Journal"; every optional field is `None`
    /// and every collection is empty, so each test sets only what it needs.
    /// Panics if `pmcid` does not parse.
    fn test_article(title: &str, pmcid: &str) -> PmcArticle {
        PmcArticle {
            pmcid: PmcId::parse(pmcid).unwrap(),
            pmid: None,
            doi: None,
            article_type: None,
            categories: vec![],
            title: title.to_string(),
            subtitle: None,
            authors: vec![],
            journal: JournalMeta {
                title: "Test Journal".to_string(),
                abbreviation: None,
                issn_print: None,
                issn_electronic: None,
                publisher: None,
            },
            pub_dates: vec![],
            volume: None,
            issue: None,
            fpage: None,
            lpage: None,
            elocation_id: None,
            abstract_text: None,
            abstract_sections: vec![],
            keywords: vec![],
            sections: vec![],
            references: vec![],
            funding: vec![],
            acknowledgments: None,
            conflict_of_interest: None,
            data_availability: None,
            supplementary_materials: vec![],
            appendices: vec![],
            glossary: vec![],
            copyright: None,
            license: None,
            license_url: None,
            history_dates: vec![],
        }
    }
949
950 #[test]
951 fn test_markdown_converter_creation() {
952 let converter = PmcMarkdownConverter::new();
953 assert!(converter.config.include_metadata);
954 assert_eq!(converter.config.heading_style, HeadingStyle::ATX);
955 assert_eq!(converter.config.reference_style, ReferenceStyle::Numbered);
956 }
957
958 #[test]
959 fn test_configuration_builder() {
960 let converter = PmcMarkdownConverter::new()
961 .with_include_metadata(false)
962 .with_heading_style(HeadingStyle::Setext)
963 .with_reference_style(ReferenceStyle::AuthorYear)
964 .with_max_heading_level(4);
965
966 assert!(!converter.config.include_metadata);
967 assert_eq!(converter.config.heading_style, HeadingStyle::Setext);
968 assert_eq!(converter.config.reference_style, ReferenceStyle::AuthorYear);
969 assert_eq!(converter.config.max_heading_level, 4);
970 }
971
972 #[test]
973 fn test_heading_formatting() {
974 let converter = PmcMarkdownConverter::new();
975
976 assert_eq!(converter.format_heading("Title", 1), "# Title");
978 assert_eq!(converter.format_heading("Subtitle", 2), "## Subtitle");
979
980 let converter = converter.with_heading_style(HeadingStyle::Setext);
982 assert_eq!(converter.format_heading("Title", 1), "Title\n=====");
983 assert_eq!(
984 converter.format_heading("Subtitle", 2),
985 "Subtitle\n--------"
986 );
987 assert_eq!(converter.format_heading("Section", 3), "### Section");
988 }
989
990 #[test]
991 fn test_clean_content() {
992 let converter = PmcMarkdownConverter::new();
993
994 let dirty = "<p>This is <em>emphasis</em> and & entities</p>";
995 let clean = converter.clean_content(dirty);
996 assert_eq!(clean, "This is emphasis and & entities");
997 }
998
999 #[test]
1000 fn test_anchor_creation() {
1001 let converter = PmcMarkdownConverter::new();
1002
1003 assert_eq!(converter.create_anchor("Introduction"), "introduction");
1004 assert_eq!(
1005 converter.create_anchor("Methods & Results"),
1006 "methods-results"
1007 );
1008 assert_eq!(
1009 converter.create_anchor("Discussion (2023)"),
1010 "discussion-2023"
1011 );
1012 }
1013
1014 #[test]
1015 fn test_basic_conversion() {
1016 let converter = PmcMarkdownConverter::new();
1017
1018 let mut article = test_article("Test Article", "PMC1234567");
1019 article.pmid = Some(PubMedId::parse("12345").unwrap());
1020 article.authors = vec![Author::from_full_name("John Doe".to_string())];
1021 article.pub_dates = vec![PublicationDate {
1022 pub_type: None,
1023 year: Some(2023),
1024 month: None,
1025 day: None,
1026 }];
1027 article.doi = Some("10.1000/test".to_string());
1028 article.article_type = Some("research-article".to_string());
1029 article.keywords = vec!["test".to_string(), "example".to_string()];
1030
1031 let markdown = converter.convert(&article);
1032 assert!(markdown.contains("# Test Article"));
1033 assert!(markdown.contains("**Authors:** John Doe"));
1034 assert!(markdown.contains("**Journal:** Test Journal"));
1035 assert!(markdown.contains("DOI: 10.1000/test"));
1036 assert!(markdown.contains("**Keywords:** test, example"));
1037 }
1038
1039 #[test]
1040 fn test_yaml_frontmatter_basic() {
1041 let converter = PmcMarkdownConverter::new().with_yaml_frontmatter(true);
1042
1043 let mut article = test_article("Test Article", "PMC1234567");
1044 article.pmid = Some(PubMedId::parse("12345").unwrap());
1045 article.authors = vec![
1046 Author::from_full_name("John Doe".to_string()),
1047 Author::from_full_name("Jane Smith".to_string()),
1048 ];
1049 article.pub_dates = vec![PublicationDate {
1050 pub_type: None,
1051 year: Some(2023),
1052 month: Some(5),
1053 day: Some(15),
1054 }];
1055 article.doi = Some("10.1000/test".to_string());
1056 article.article_type = Some("research-article".to_string());
1057 article.keywords = vec!["test".to_string(), "example".to_string()];
1058
1059 let markdown = converter.convert(&article);
1060
1061 assert!(markdown.starts_with("---\n"));
1063 let delimiter_count = markdown.matches("---").count();
1064 assert_eq!(
1065 delimiter_count, 2,
1066 "Should have opening and closing YAML frontmatter delimiters"
1067 );
1068
1069 assert!(markdown.contains("title: Test Article"));
1071 assert!(markdown.contains("authors:"));
1072 assert!(markdown.contains("- John Doe"));
1073 assert!(markdown.contains("- Jane Smith"));
1074 assert!(markdown.contains("journal: Test Journal"));
1075 assert!(
1076 markdown.contains("pub_date: '2023-05-15'")
1077 || markdown.contains("pub_date: 2023-05-15")
1078 );
1079 assert!(markdown.contains("pmcid: PMC1234567"));
1080 assert!(markdown.contains("pmid: '12345'"));
1081 assert!(markdown.contains("doi: 10.1000/test"));
1082 assert!(markdown.contains("article_type: research-article"));
1083 assert!(markdown.contains("keywords:"));
1084 assert!(markdown.contains("- test"));
1085 assert!(markdown.contains("- example"));
1086 }
1087
1088 #[test]
1089 fn test_yaml_frontmatter_with_special_characters() {
1090 let converter = PmcMarkdownConverter::new().with_yaml_frontmatter(true);
1091
1092 let mut article = test_article("COVID-19: A Comprehensive Study", "PMC7890123");
1093 article.journal.title = "Nature: Medicine & Science".to_string();
1094 article.authors = vec![Author::from_full_name("O'Brien, Michael".to_string())];
1095 article.pub_dates = vec![PublicationDate {
1096 pub_type: None,
1097 year: Some(2023),
1098 month: None,
1099 day: None,
1100 }];
1101 article.doi = Some("10.1038/s41591-023-01234-5".to_string());
1102 article.article_type = Some("research-article".to_string());
1103 article.keywords = vec![
1104 "#COVID-19".to_string(),
1105 "SARS-CoV-2".to_string(),
1106 "vaccine".to_string(),
1107 ];
1108
1109 let markdown = converter.convert(&article);
1110
1111 assert!(
1112 markdown.contains("title: 'COVID-19: A Comprehensive Study'")
1113 || markdown.contains("title: \"COVID-19: A Comprehensive Study\"")
1114 );
1115 assert!(
1116 markdown.contains("journal: 'Nature: Medicine & Science'")
1117 || markdown.contains("journal: \"Nature: Medicine & Science\"")
1118 );
1119 assert!(markdown.contains("'#COVID-19'") || markdown.contains("\"#COVID-19\""));
1120 assert!(markdown.contains("SARS-CoV-2"));
1121 }
1122
1123 #[test]
1124 fn test_yaml_frontmatter_backward_compatibility() {
1125 let converter = PmcMarkdownConverter::new();
1127 assert!(!converter.config.use_yaml_frontmatter);
1128
1129 let article = test_article("Test Article", "PMC1234567");
1130
1131 let markdown = converter.convert(&article);
1132
1133 assert!(markdown.contains("# Test Article"));
1135 assert!(markdown.contains("**Journal:** Test Journal"));
1136 assert!(!markdown.starts_with("---\n"));
1138 }
1139
1140 #[test]
1141 fn test_builder_pattern_with_yaml_frontmatter() {
1142 let converter = PmcMarkdownConverter::new()
1143 .with_yaml_frontmatter(true)
1144 .with_include_metadata(true)
1145 .with_heading_style(HeadingStyle::ATX);
1146
1147 assert!(converter.config.use_yaml_frontmatter);
1148 assert!(converter.config.include_metadata);
1149 assert_eq!(converter.config.heading_style, HeadingStyle::ATX);
1150 }
1151}