pubmed_formatter/pmc/
markdown.rs

1//! Markdown conversion functionality for PMC articles
2//!
3//! This module provides functionality to convert parsed PMC articles into
4//! well-formatted Markdown documents with configurable styling options.
5
6use std::collections::HashMap;
7
8use serde::Serialize;
9
10use pubmed_parser::common::{Author, PublicationDate};
11use pubmed_parser::pmc::{Figure, FundingInfo, PmcArticle, Reference, Section, Table};
12
/// HTML entity mappings for common entities found in PMC articles.
///
/// Each pair is `(entity, replacement)`. Consumers apply the replacements in
/// table order, so `&amp;` is deliberately the final entry: decoding it any
/// earlier would turn an escaped entity such as `&amp;lt;` into `&lt;` and
/// then into `<`, i.e. decode the same text twice.
static HTML_ENTITIES: &[(&str, &str)] = &[
    // Basic HTML entities (`&amp;` lives at the end of the table, see above)
    ("&lt;", "<"),
    ("&gt;", ">"),
    ("&quot;", "\""),
    ("&#x27;", "'"),
    ("&apos;", "'"),
    // Quotation marks
    ("&#8217;", "'"),  // right single quotation mark
    ("&#8216;", "'"),  // left single quotation mark
    ("&#8220;", "\""), // left double quotation mark
    ("&#8221;", "\""), // right double quotation mark
    ("&rsquo;", "'"),  // right single quote
    ("&lsquo;", "'"),  // left single quote
    ("&rdquo;", "\""), // right double quote
    ("&ldquo;", "\""), // left double quote
    // Dashes and spacing
    ("&#8211;", "-"),  // en dash
    ("&#8212;", "--"), // em dash
    ("&#160;", " "),   // non-breaking space
    ("&nbsp;", " "),   // non-breaking space
    ("&ndash;", "-"),  // en dash
    ("&mdash;", "--"), // em dash
    // Special punctuation
    ("&#8230;", "..."),  // ellipsis
    ("&hellip;", "..."), // ellipsis
    // Symbols
    ("&#8482;", "(TM)"), // trademark
    ("&#174;", "(R)"),   // registered trademark
    ("&#169;", "(C)"),   // copyright
    ("&trade;", "(TM)"), // trademark
    ("&reg;", "(R)"),    // registered trademark
    ("&copy;", "(C)"),   // copyright
    // Currency (simplified)
    ("&#8364;", "EUR"), // euro
    ("&#163;", "GBP"),  // pound
    ("&#165;", "JPY"),  // yen
    // Mathematical symbols
    ("&#8722;", "-"),  // minus sign
    ("&#215;", "x"),   // multiplication sign
    ("&#247;", "/"),   // division sign
    ("&#177;", "±"),   // plus-minus sign
    ("&times;", "x"),  // multiplication sign
    ("&divide;", "/"), // division sign
    ("&plusmn;", "±"), // plus-minus sign
    // Greek letters (common in scientific texts)
    ("&#945;", "α"),    // alpha
    ("&#946;", "β"),    // beta
    ("&#947;", "γ"),    // gamma
    ("&#948;", "δ"),    // delta
    ("&#949;", "ε"),    // epsilon
    ("&#956;", "μ"),    // mu
    ("&#960;", "π"),    // pi
    ("&#963;", "σ"),    // sigma
    ("&alpha;", "α"),   // alpha
    ("&beta;", "β"),    // beta
    ("&gamma;", "γ"),   // gamma
    ("&delta;", "δ"),   // delta
    ("&epsilon;", "ε"), // epsilon
    ("&mu;", "μ"),      // mu
    ("&pi;", "π"),      // pi
    ("&sigma;", "σ"),   // sigma
    // Ampersand: must stay the final entry so nothing is decoded twice
    ("&amp;", "&"),
];
78
/// Metadata structure for YAML frontmatter serialization
///
/// Filled in by `generate_yaml_frontmatter` from a `PmcArticle` and handed to
/// `serde_yaml::to_string`. Fields guarded by `skip_serializing_if` are left
/// out of the output when they are `None` or an empty vector.
#[derive(Debug, Clone, Serialize)]
struct ArticleMetadata {
    /// Cleaned article title.
    title: String,
    /// Cleaned full names of the authors.
    #[serde(skip_serializing_if = "Vec::is_empty")]
    authors: Vec<String>,
    /// Cleaned journal title.
    journal: String,
    /// Cleaned journal abbreviation, when the article has one.
    #[serde(skip_serializing_if = "Option::is_none")]
    journal_abbrev: Option<String>,
    /// First publication date as produced by `format_first_pub_date`.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub_date: Option<String>,
    /// PMC identifier, taken from `article.pmcid.as_str()`.
    pmcid: String,
    /// PubMed identifier, when present.
    #[serde(skip_serializing_if = "Option::is_none")]
    pmid: Option<String>,
    /// Cleaned DOI, when present.
    #[serde(skip_serializing_if = "Option::is_none")]
    doi: Option<String>,
    /// Cleaned article type, when present.
    #[serde(skip_serializing_if = "Option::is_none")]
    article_type: Option<String>,
    /// Cleaned keywords.
    #[serde(skip_serializing_if = "Vec::is_empty")]
    keywords: Vec<String>,
    /// Cleaned journal volume, when present.
    #[serde(skip_serializing_if = "Option::is_none")]
    volume: Option<String>,
    /// Cleaned journal issue, when present.
    #[serde(skip_serializing_if = "Option::is_none")]
    issue: Option<String>,
    /// Cleaned publisher name, when present.
    #[serde(skip_serializing_if = "Option::is_none")]
    publisher: Option<String>,
}
106
/// Configuration options for Markdown conversion
///
/// Use `MarkdownConfig::default()` for the baseline and override individual
/// fields, or go through the `with_*` builder methods on
/// `PmcMarkdownConverter`.
#[derive(Debug, Clone)]
pub struct MarkdownConfig {
    /// Include metadata section at the top.
    ///
    /// When `false`, only the cleaned article title is emitted, as a level-1
    /// heading.
    pub include_metadata: bool,
    /// Include table of contents (emitted right after the metadata/title block)
    pub include_toc: bool,
    /// Heading style preference
    pub heading_style: HeadingStyle,
    /// Reference formatting style
    pub reference_style: ReferenceStyle,
    /// Maximum heading level (1-6).
    ///
    /// Headings that would be deeper are emitted at this level instead. The
    /// `with_max_heading_level` builder clamps its argument to this range.
    pub max_heading_level: u8,
    /// Include author ORCID links
    pub include_orcid_links: bool,
    /// Include DOI and PMID links (in the metadata block and the reference list)
    pub include_identifier_links: bool,
    /// Include figure and table captions.
    ///
    /// This single flag gates both the figure blocks and the table blocks
    /// (including table footnotes) of every section.
    pub include_figure_captions: bool,
    /// Include local figure file paths in markdown images.
    ///
    /// Only has an effect in `convert_with_figures`, and only for figures
    /// whose id has an entry in the supplied path map.
    pub include_local_figures: bool,
    /// Use YAML frontmatter for metadata instead of bold markdown format.
    ///
    /// Only consulted when `include_metadata` is `true`.
    pub use_yaml_frontmatter: bool,
}
131
/// Heading style options.
///
/// The enum is fieldless, so it is `Copy` and `Eq` in addition to the
/// debugging and comparison traits; callers can pass it around by value.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum HeadingStyle {
    /// ATX style headers (# ## ###)
    ATX,
    /// Setext style headers (underlined)
    Setext,
}
140
/// Reference formatting style.
///
/// The enum is fieldless, so it is `Copy` and `Eq` in addition to the
/// debugging and comparison traits; callers can pass it around by value.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum ReferenceStyle {
    /// Numbered references \[1\], \[2\], etc.
    Numbered,
    /// Author-year style (Smith, 2023)
    AuthorYear,
    /// Full citation format
    FullCitation,
}
151
152impl Default for MarkdownConfig {
153    fn default() -> Self {
154        Self {
155            include_metadata: true,
156            include_toc: false,
157            heading_style: HeadingStyle::ATX,
158            reference_style: ReferenceStyle::Numbered,
159            max_heading_level: 6,
160            include_orcid_links: true,
161            include_identifier_links: true,
162            include_figure_captions: true,
163            include_local_figures: false,
164            use_yaml_frontmatter: false,
165        }
166    }
167}
168
169/// Format the first publication date as a "YYYY-MM-DD" / "YYYY-MM" / "YYYY" string.
170fn format_first_pub_date(dates: &[PublicationDate]) -> Option<String> {
171    let d = dates.first()?;
172    let year = d.year?;
173    match (d.month, d.day) {
174        (Some(m), Some(day)) => Some(format!("{year}-{m:02}-{day:02}")),
175        (Some(m), None) => Some(format!("{year}-{m:02}")),
176        _ => Some(year.to_string()),
177    }
178}
179
180/// PMC to Markdown converter
181pub struct PmcMarkdownConverter {
182    config: MarkdownConfig,
183}
184
185impl PmcMarkdownConverter {
186    /// Create a new converter with default configuration
187    pub fn new() -> Self {
188        Self {
189            config: MarkdownConfig::default(),
190        }
191    }
192
193    /// Create a converter with custom configuration
194    pub fn with_config(config: MarkdownConfig) -> Self {
195        Self { config }
196    }
197
198    /// Set whether to include metadata
199    pub fn with_include_metadata(mut self, include: bool) -> Self {
200        self.config.include_metadata = include;
201        self
202    }
203
204    /// Set whether to include table of contents
205    pub fn with_include_toc(mut self, include: bool) -> Self {
206        self.config.include_toc = include;
207        self
208    }
209
210    /// Set heading style
211    pub fn with_heading_style(mut self, style: HeadingStyle) -> Self {
212        self.config.heading_style = style;
213        self
214    }
215
216    /// Set reference style
217    pub fn with_reference_style(mut self, style: ReferenceStyle) -> Self {
218        self.config.reference_style = style;
219        self
220    }
221
222    /// Set maximum heading level
223    pub fn with_max_heading_level(mut self, level: u8) -> Self {
224        self.config.max_heading_level = level.clamp(1, 6);
225        self
226    }
227
228    /// Set whether to include ORCID links
229    pub fn with_include_orcid_links(mut self, include: bool) -> Self {
230        self.config.include_orcid_links = include;
231        self
232    }
233
234    /// Set whether to include identifier links
235    pub fn with_include_identifier_links(mut self, include: bool) -> Self {
236        self.config.include_identifier_links = include;
237        self
238    }
239
240    /// Set whether to include figure captions
241    pub fn with_include_figure_captions(mut self, include: bool) -> Self {
242        self.config.include_figure_captions = include;
243        self
244    }
245
246    /// Set whether to use YAML frontmatter for metadata
247    pub fn with_yaml_frontmatter(mut self, use_yaml: bool) -> Self {
248        self.config.use_yaml_frontmatter = use_yaml;
249        self
250    }
251
252    /// Convert a PMC article to Markdown with optional figure paths
253    pub fn convert_with_figures(
254        &self,
255        article: &PmcArticle,
256        figure_paths: Option<&HashMap<String, String>>,
257    ) -> String {
258        let mut markdown = String::new();
259
260        // Add metadata section
261        if self.config.include_metadata {
262            markdown.push_str(&self.convert_metadata(article));
263            markdown.push_str("\n\n");
264        } else {
265            // Always include at least the title even when metadata is disabled
266            markdown.push_str(&self.format_heading(&self.clean_content(&article.title), 1));
267            markdown.push_str("\n\n");
268        }
269
270        // Add table of contents if requested
271        if self.config.include_toc {
272            markdown.push_str(&self.convert_toc(article));
273            markdown.push_str("\n\n");
274        }
275
276        // Add main content sections
277        markdown.push_str(&self.convert_sections_with_figures(&article.sections, 1, figure_paths));
278
279        // Add references section
280        if !article.references.is_empty() {
281            markdown.push_str(&self.convert_references(&article.references));
282        }
283
284        // Add additional sections
285        markdown.push_str(&self.convert_additional_sections(article));
286
287        markdown.trim().to_string()
288    }
289
290    /// Convert a PMC article to Markdown
291    pub fn convert(&self, article: &PmcArticle) -> String {
292        let mut markdown = String::new();
293
294        // Add metadata section
295        if self.config.include_metadata {
296            markdown.push_str(&self.convert_metadata(article));
297            markdown.push_str("\n\n");
298        } else {
299            // Always include at least the title even when metadata is disabled
300            markdown.push_str(&self.format_heading(&self.clean_content(&article.title), 1));
301            markdown.push_str("\n\n");
302        }
303
304        // Add table of contents if requested
305        if self.config.include_toc {
306            markdown.push_str(&self.convert_toc(article));
307            markdown.push_str("\n\n");
308        }
309
310        // Add main content sections
311        markdown.push_str(&self.convert_sections(&article.sections, 1));
312
313        // Add references section
314        if !article.references.is_empty() {
315            markdown.push_str(&self.convert_references(&article.references));
316        }
317
318        // Add additional sections
319        markdown.push_str(&self.convert_additional_sections(article));
320
321        markdown.trim().to_string()
322    }
323
324    /// Generate YAML frontmatter from article metadata
325    fn generate_yaml_frontmatter(&self, article: &PmcArticle) -> String {
326        // Build metadata structure
327        let metadata = ArticleMetadata {
328            title: self.clean_content(&article.title),
329            authors: article
330                .authors
331                .iter()
332                .map(|a| self.clean_content(&a.full_name))
333                .collect(),
334            journal: self.clean_content(&article.journal.title),
335            journal_abbrev: article
336                .journal
337                .abbreviation
338                .as_ref()
339                .map(|a| self.clean_content(a)),
340            pub_date: format_first_pub_date(&article.pub_dates),
341            pmcid: article.pmcid.as_str(),
342            pmid: article.pmid.as_ref().map(|p| p.as_str()),
343            doi: article.doi.as_ref().map(|d| self.clean_content(d)),
344            article_type: article.article_type.as_ref().map(|t| self.clean_content(t)),
345            keywords: article
346                .keywords
347                .iter()
348                .map(|k| self.clean_content(k))
349                .collect(),
350            volume: article.volume.as_ref().map(|v| self.clean_content(v)),
351            issue: article.issue.as_ref().map(|i| self.clean_content(i)),
352            publisher: article
353                .journal
354                .publisher
355                .as_ref()
356                .map(|p| self.clean_content(p)),
357        };
358
359        // Serialize to YAML with proper formatting
360        match serde_yaml::to_string(&metadata) {
361            Ok(yaml_content) => format!("---\n{}---\n", yaml_content),
362            Err(e) => {
363                tracing::warn!("Failed to serialize YAML frontmatter: {}", e);
364                // Fallback to empty frontmatter
365                "---\n---\n".to_string()
366            }
367        }
368    }
369
370    /// Convert metadata section
371    fn convert_metadata(&self, article: &PmcArticle) -> String {
372        // Use YAML frontmatter if configured
373        if self.config.use_yaml_frontmatter {
374            return self.generate_yaml_frontmatter(article);
375        }
376
377        let mut metadata = String::new();
378
379        // Title
380        metadata.push_str(&self.format_heading(&self.clean_content(&article.title), 1));
381        metadata.push('\n');
382
383        // Authors
384        if !article.authors.is_empty() {
385            metadata.push_str("\n**Authors:** ");
386            metadata.push_str(&self.format_authors(&article.authors));
387            metadata.push('\n');
388        }
389
390        // Journal information
391        let journal_title = &article.journal.title;
392        metadata.push_str(&format!("\n**Journal:** {journal_title}"));
393        if let Some(abbrev) = &article.journal.abbreviation {
394            metadata.push_str(&format!(" ({abbrev})"));
395        }
396        metadata.push('\n');
397
398        // Publication date
399        if let Some(pub_date) = format_first_pub_date(&article.pub_dates) {
400            metadata.push_str(&format!("**Published:** {pub_date}\n"));
401        }
402
403        // Identifiers
404        let mut identifiers = Vec::new();
405        if let Some(doi) = &article.doi {
406            if self.config.include_identifier_links {
407                identifiers.push(format!("[DOI: {doi}](https://doi.org/{doi})"));
408            } else {
409                identifiers.push(format!("DOI: {doi}"));
410            }
411        }
412        if let Some(pmid) = &article.pmid {
413            let pmid_str = pmid.as_str();
414            if self.config.include_identifier_links {
415                identifiers.push(format!(
416                    "[PMID: {pmid_str}](https://pubmed.ncbi.nlm.nih.gov/{pmid_str})"
417                ));
418            } else {
419                identifiers.push(format!("PMID: {pmid_str}"));
420            }
421        }
422        let pmcid = article.pmcid.as_str();
423        identifiers.push(format!("PMC: {pmcid}"));
424
425        if !identifiers.is_empty() {
426            let identifiers_str = identifiers.join(" | ");
427            metadata.push_str(&format!("**Identifiers:** {identifiers_str}\n"));
428        }
429
430        // Article type
431        if let Some(article_type) = &article.article_type {
432            metadata.push_str(&format!("**Article Type:** {article_type}\n"));
433        }
434
435        // Keywords
436        if !article.keywords.is_empty() {
437            let clean_keywords: Vec<String> = article
438                .keywords
439                .iter()
440                .map(|k| self.clean_content(k))
441                .collect();
442            let keywords_str = clean_keywords.join(", ");
443            metadata.push_str(&format!("**Keywords:** {keywords_str}\n"));
444        }
445
446        // Journal details
447        let mut journal_details = Vec::new();
448        if let Some(volume) = &article.volume {
449            journal_details.push(format!("Volume {volume}"));
450        }
451        if let Some(issue) = &article.issue {
452            journal_details.push(format!("Issue {issue}"));
453        }
454        if let Some(publisher) = &article.journal.publisher {
455            journal_details.push(format!("Publisher: {publisher}"));
456        }
457        if !journal_details.is_empty() {
458            metadata.push_str(&format!(
459                "**Journal Details:** {}\n",
460                journal_details.join(" | ")
461            ));
462        }
463
464        metadata
465    }
466
467    /// Convert table of contents
468    fn convert_toc(&self, article: &PmcArticle) -> String {
469        let mut toc = String::new();
470        toc.push_str(&self.format_heading("Table of Contents", 2));
471        toc.push('\n');
472
473        for (i, section) in article.sections.iter().enumerate() {
474            let default_title = "Untitled".to_string();
475            let title = section.title.as_ref().unwrap_or(&default_title);
476            let anchor = self.create_anchor(title);
477            let index = i + 1;
478            toc.push_str(&format!("{index}. [{title}](#{anchor})\n"));
479
480            // Add subsections
481            for (j, subsection) in section.subsections.iter().enumerate() {
482                let default_sub_title = "Untitled".to_string();
483                let sub_title = subsection.title.as_ref().unwrap_or(&default_sub_title);
484                let sub_anchor = self.create_anchor(sub_title);
485                let main_index = i + 1;
486                let sub_index = j + 1;
487                toc.push_str(&format!(
488                    "   {main_index}.{sub_index}. [{sub_title}](#{sub_anchor})\n"
489                ));
490            }
491        }
492
493        toc
494    }
495
496    /// Convert article sections with figure paths
497    fn convert_sections_with_figures(
498        &self,
499        sections: &[Section],
500        level: u8,
501        figure_paths: Option<&HashMap<String, String>>,
502    ) -> String {
503        let mut content = String::new();
504
505        for section in sections {
506            // Section heading
507            if let Some(title) = &section.title {
508                content.push_str(&self.format_heading(title, level));
509                content.push_str("\n\n");
510            }
511
512            // Section content
513            if !section.content.is_empty() {
514                content.push_str(&self.clean_content(&section.content));
515                content.push_str("\n\n");
516            }
517
518            // Figures
519            if self.config.include_figure_captions {
520                for figure in &section.figures {
521                    let figure_path = figure_paths.and_then(|paths| paths.get(&figure.id));
522                    content.push_str(&self.convert_figure_with_path(figure, figure_path));
523                    content.push_str("\n\n");
524                }
525            }
526
527            // Tables
528            if self.config.include_figure_captions {
529                for table in &section.tables {
530                    content.push_str(&self.convert_table(table));
531                    content.push_str("\n\n");
532                }
533            }
534
535            // Subsections
536            if !section.subsections.is_empty() {
537                let next_level = (level + 1).min(self.config.max_heading_level);
538                content.push_str(&self.convert_sections_with_figures(
539                    &section.subsections,
540                    next_level,
541                    figure_paths,
542                ));
543            }
544        }
545
546        content
547    }
548
549    /// Convert article sections
550    fn convert_sections(&self, sections: &[Section], level: u8) -> String {
551        let mut content = String::new();
552
553        for section in sections {
554            // Section heading
555            if let Some(title) = &section.title {
556                content.push_str(&self.format_heading(title, level));
557                content.push_str("\n\n");
558            }
559
560            // Section content
561            if !section.content.is_empty() {
562                content.push_str(&self.clean_content(&section.content));
563                content.push_str("\n\n");
564            }
565
566            // Figures
567            if self.config.include_figure_captions {
568                for figure in &section.figures {
569                    content.push_str(&self.convert_figure(figure));
570                    content.push_str("\n\n");
571                }
572            }
573
574            // Tables
575            if self.config.include_figure_captions {
576                for table in &section.tables {
577                    content.push_str(&self.convert_table(table));
578                    content.push_str("\n\n");
579                }
580            }
581
582            // Subsections
583            if !section.subsections.is_empty() {
584                let next_level = (level + 1).min(self.config.max_heading_level);
585                content.push_str(&self.convert_sections(&section.subsections, next_level));
586            }
587        }
588
589        content
590    }
591
592    /// Convert references section
593    fn convert_references(&self, references: &[Reference]) -> String {
594        let mut content = String::new();
595        content.push_str(&self.format_heading("References", 2));
596        content.push_str("\n\n");
597
598        match self.config.reference_style {
599            ReferenceStyle::Numbered => {
600                for (i, reference) in references.iter().enumerate() {
601                    content.push_str(&format!(
602                        "{}. {}\n",
603                        i + 1,
604                        self.format_reference(reference)
605                    ));
606                }
607            }
608            ReferenceStyle::AuthorYear | ReferenceStyle::FullCitation => {
609                for reference in references {
610                    let formatted_ref = self.format_reference(reference);
611                    content.push_str(&format!("- {formatted_ref}\n"));
612                }
613            }
614        }
615
616        content.push('\n');
617        content
618    }
619
620    /// Convert additional sections (funding, conflicts, acknowledgments)
621    fn convert_additional_sections(&self, article: &PmcArticle) -> String {
622        let mut content = String::new();
623
624        // Funding
625        if !article.funding.is_empty() {
626            content.push_str(&self.format_heading("Funding", 2));
627            content.push_str("\n\n");
628            for funding in &article.funding {
629                content.push_str(&self.format_funding(funding));
630                content.push('\n');
631            }
632            content.push('\n');
633        }
634
635        // Conflict of interest
636        if let Some(coi) = &article.conflict_of_interest {
637            content.push_str(&self.format_heading("Conflict of Interest", 2));
638            content.push_str("\n\n");
639            content.push_str(&self.clean_content(coi));
640            content.push_str("\n\n");
641        }
642
643        // Acknowledgments
644        if let Some(ack) = &article.acknowledgments {
645            content.push_str(&self.format_heading("Acknowledgments", 2));
646            content.push_str("\n\n");
647            content.push_str(&self.clean_content(ack));
648            content.push_str("\n\n");
649        }
650
651        // Data availability
652        if let Some(data_avail) = &article.data_availability {
653            content.push_str(&self.format_heading("Data Availability", 2));
654            content.push_str("\n\n");
655            content.push_str(&self.clean_content(data_avail));
656            content.push_str("\n\n");
657        }
658
659        content
660    }
661
662    /// Format a heading based on the configured style
663    fn format_heading(&self, text: &str, level: u8) -> String {
664        let level = level.min(self.config.max_heading_level);
665
666        match self.config.heading_style {
667            HeadingStyle::ATX => {
668                let hashes = "#".repeat(level as usize);
669                format!("{hashes} {text}")
670            }
671            HeadingStyle::Setext => {
672                if level == 1 {
673                    let underline = "=".repeat(text.len());
674                    format!("{text}\n{underline}")
675                } else if level == 2 {
676                    let underline = "-".repeat(text.len());
677                    format!("{text}\n{underline}")
678                } else {
679                    // Fall back to ATX for levels 3+
680                    let hashes = "#".repeat(level as usize);
681                    format!("{hashes} {text}")
682                }
683            }
684        }
685    }
686
687    /// Format authors list (simplified)
688    fn format_authors(&self, authors: &[Author]) -> String {
689        authors
690            .iter()
691            .map(|author| {
692                let mut name = self.clean_content(&author.full_name);
693
694                // Add corresponding author indicator with *
695                if author.is_corresponding {
696                    name.push('*');
697                }
698
699                // Add simple ORCID link if available and enabled
700                if self.config.include_orcid_links
701                    && let Some(orcid) = &author.orcid
702                {
703                    // Clean the ORCID string first to remove any XML tags
704                    let cleaned_orcid = self.clean_content(orcid);
705                    let clean_orcid = cleaned_orcid.trim_start_matches("https://orcid.org/");
706
707                    // Basic ORCID format validation (should be like 0000-0000-0000-0000)
708                    if clean_orcid.len() >= 19 && clean_orcid.matches('-').count() == 3 {
709                        name.push_str(&format!(" ([ORCID](https://orcid.org/{clean_orcid}))"));
710                    }
711                }
712
713                name
714            })
715            .collect::<Vec<String>>()
716            .join(", ")
717    }
718
719    /// Format a single reference
720    fn format_reference(&self, reference: &Reference) -> String {
721        match self.config.reference_style {
722            ReferenceStyle::Numbered | ReferenceStyle::FullCitation => {
723                let citation = reference.format_citation();
724
725                if self.config.include_identifier_links {
726                    let mut formatted = citation;
727
728                    // Add DOI link
729                    if let Some(doi) = &reference.doi {
730                        formatted.push_str(&format!(" [DOI](https://doi.org/{doi})"));
731                    }
732
733                    // Add PMID link
734                    if let Some(pmid) = &reference.pmid {
735                        formatted
736                            .push_str(&format!(" [PMID](https://pubmed.ncbi.nlm.nih.gov/{pmid})"));
737                    }
738
739                    formatted
740                } else {
741                    citation
742                }
743            }
744            ReferenceStyle::AuthorYear => {
745                if let (Some(first_author), Some(year)) =
746                    (reference.authors.first(), reference.year.as_ref())
747                {
748                    format!("{} ({})", first_author.full_name, year)
749                } else {
750                    reference.format_citation()
751                }
752            }
753        }
754    }
755
756    /// Format funding information
757    fn format_funding(&self, funding: &FundingInfo) -> String {
758        let source = &funding.source;
759        let mut text = format!("- **{source}**");
760
761        if let Some(award_id) = &funding.award_id {
762            text.push_str(&format!(" (Award ID: {award_id})"));
763        }
764
765        if let Some(statement) = &funding.statement {
766            let content = self.clean_content(statement);
767            text.push_str(&format!(": {content}"));
768        }
769
770        text
771    }
772
773    /// Convert figure to markdown with optional path
774    fn convert_figure_with_path(&self, figure: &Figure, figure_path: Option<&String>) -> String {
775        let mut content = String::new();
776
777        // Add image if path is provided and include_local_figures is enabled
778        if self.config.include_local_figures
779            && let Some(path) = figure_path
780        {
781            let alt_text = figure
782                .alt_text
783                .as_deref()
784                .or(figure.label.as_deref())
785                .unwrap_or(&figure.id);
786            content.push_str(&format!("![{alt_text}]({path})\n\n"));
787        }
788
789        if let Some(label) = &figure.label {
790            content.push_str(&format!("**{label}**"));
791        } else {
792            let figure_id = &figure.id;
793            content.push_str(&format!("**Figure {figure_id}**"));
794        }
795
796        let caption = self.clean_content(&figure.caption);
797        content.push_str(&format!(": {caption}"));
798
799        if let Some(alt_text) = &figure.alt_text {
800            let alt_content = self.clean_content(alt_text);
801            content.push_str(&format!("\n\n*Alt text: {alt_content}*"));
802        }
803
804        content
805    }
806
807    /// Convert figure to markdown
808    fn convert_figure(&self, figure: &Figure) -> String {
809        let mut content = String::new();
810
811        if let Some(label) = &figure.label {
812            content.push_str(&format!("**{label}**"));
813        } else {
814            let figure_id = &figure.id;
815            content.push_str(&format!("**Figure {figure_id}**"));
816        }
817
818        let caption = self.clean_content(&figure.caption);
819        content.push_str(&format!(": {caption}"));
820
821        if let Some(alt_text) = &figure.alt_text {
822            let alt_content = self.clean_content(alt_text);
823            content.push_str(&format!("\n\n*Alt text: {alt_content}*"));
824        }
825
826        content
827    }
828
829    /// Convert table to markdown
830    fn convert_table(&self, table: &Table) -> String {
831        let mut content = String::new();
832
833        if let Some(label) = &table.label {
834            content.push_str(&format!("**{label}**"));
835        } else {
836            let table_id = &table.id;
837            content.push_str(&format!("**Table {table_id}**"));
838        }
839
840        let caption = self.clean_content(&table.caption);
841        content.push_str(&format!(": {caption}"));
842
843        if !table.footnotes.is_empty() {
844            content.push_str("\n\n*Footnotes:*\n");
845            for (i, footnote) in table.footnotes.iter().enumerate() {
846                let index = i + 1;
847                let footnote_content = self.clean_content(footnote);
848                content.push_str(&format!("{index}. {footnote_content}\n"));
849            }
850        }
851
852        content
853    }
854
855    /// Clean content by removing XML tags and fixing formatting
856    fn clean_content(&self, content: &str) -> String {
857        // Remove XML tags but preserve content
858        let mut cleaned = content.to_string();
859
860        // Remove common XML tags while preserving content
861        cleaned = regex::Regex::new(r"<[^>]*>")
862            .unwrap()
863            .replace_all(&cleaned, "")
864            .to_string();
865
866        // Fix HTML entities using the predefined table
867        for (entity, replacement) in HTML_ENTITIES {
868            cleaned = cleaned.replace(entity, replacement);
869        }
870
871        // Normalize whitespace
872        cleaned = regex::Regex::new(r"\s+")
873            .unwrap()
874            .replace_all(&cleaned, " ")
875            .trim()
876            .to_string();
877
878        cleaned
879    }
880
881    /// Create URL-safe anchor from title
882    fn create_anchor(&self, title: &str) -> String {
883        title
884            .to_lowercase()
885            .chars()
886            .map(|c| if c.is_alphanumeric() { c } else { '-' })
887            .collect::<String>()
888            .split('-')
889            .filter(|s| !s.is_empty())
890            .collect::<Vec<_>>()
891            .join("-")
892    }
893}
894
895impl Default for PmcMarkdownConverter {
896    fn default() -> Self {
897        Self::new()
898    }
899}
900
#[cfg(test)]
mod tests {
    use super::*;
    use pubmed_parser::common::{Author, PmcId, PubMedId, PublicationDate};
    use pubmed_parser::pmc::{JournalMeta, PmcArticle};

    /// Build a bare-bones article fixture.
    ///
    /// Only the title, the PMC id and a fixed journal title ("Test Journal")
    /// are populated; every optional field is `None` and every collection is
    /// empty. Panics if `PmcId::parse` rejects `pmcid`.
    fn test_article(title: &str, pmcid: &str) -> PmcArticle {
        let pmcid = PmcId::parse(pmcid).unwrap();
        let journal = JournalMeta {
            title: String::from("Test Journal"),
            abbreviation: None,
            issn_print: None,
            issn_electronic: None,
            publisher: None,
        };

        PmcArticle {
            pmcid,
            pmid: None,
            doi: None,
            article_type: None,
            categories: Vec::new(),
            title: String::from(title),
            subtitle: None,
            authors: Vec::new(),
            journal,
            pub_dates: Vec::new(),
            volume: None,
            issue: None,
            fpage: None,
            lpage: None,
            elocation_id: None,
            abstract_text: None,
            abstract_sections: Vec::new(),
            keywords: Vec::new(),
            sections: Vec::new(),
            references: Vec::new(),
            funding: Vec::new(),
            acknowledgments: None,
            conflict_of_interest: None,
            data_availability: None,
            supplementary_materials: Vec::new(),
            appendices: Vec::new(),
            glossary: Vec::new(),
            copyright: None,
            license: None,
            license_url: None,
            history_dates: Vec::new(),
        }
    }

    /// A freshly constructed converter includes metadata, uses ATX headings
    /// and numbered references.
    #[test]
    fn test_markdown_converter_creation() {
        let conv = PmcMarkdownConverter::new();
        let config = &conv.config;

        assert!(config.include_metadata);
        assert_eq!(config.heading_style, HeadingStyle::ATX);
        assert_eq!(config.reference_style, ReferenceStyle::Numbered);
    }

    /// Each `with_*` builder call is reflected in the resulting configuration.
    #[test]
    fn test_configuration_builder() {
        let conv = PmcMarkdownConverter::new()
            .with_include_metadata(false)
            .with_heading_style(HeadingStyle::Setext)
            .with_reference_style(ReferenceStyle::AuthorYear)
            .with_max_heading_level(4);
        let config = &conv.config;

        assert!(!config.include_metadata);
        assert_eq!(config.heading_style, HeadingStyle::Setext);
        assert_eq!(config.reference_style, ReferenceStyle::AuthorYear);
        assert_eq!(config.max_heading_level, 4);
    }

    /// ATX headings use `#` prefixes; Setext underlines levels 1 and 2 and
    /// produces a `###` heading for level 3.
    #[test]
    fn test_heading_formatting() {
        // ATX style
        let atx = PmcMarkdownConverter::new();
        assert_eq!(atx.format_heading("Title", 1), "# Title");
        assert_eq!(atx.format_heading("Subtitle", 2), "## Subtitle");

        // Setext style
        let setext = atx.with_heading_style(HeadingStyle::Setext);
        assert_eq!(setext.format_heading("Title", 1), "Title\n=====");
        assert_eq!(setext.format_heading("Subtitle", 2), "Subtitle\n--------");
        assert_eq!(setext.format_heading("Section", 3), "### Section");
    }

    /// Tags are stripped and `&amp;` is decoded to `&`.
    #[test]
    fn test_clean_content() {
        let conv = PmcMarkdownConverter::new();
        let cleaned = conv.clean_content("<p>This is <em>emphasis</em> and &amp; entities</p>");

        assert_eq!(cleaned, "This is emphasis and & entities");
    }

    /// Anchors are lowercase, with punctuation and spaces collapsed to single
    /// hyphens.
    #[test]
    fn test_anchor_creation() {
        let conv = PmcMarkdownConverter::new();

        assert_eq!(conv.create_anchor("Introduction"), "introduction");
        assert_eq!(conv.create_anchor("Methods & Results"), "methods-results");
        assert_eq!(conv.create_anchor("Discussion (2023)"), "discussion-2023");
    }

    /// The default (non-frontmatter) output contains the title heading and
    /// the bold metadata lines.
    #[test]
    fn test_basic_conversion() {
        let mut article = test_article("Test Article", "PMC1234567");
        article.pmid = Some(PubMedId::parse("12345").unwrap());
        article.authors = vec![Author::from_full_name(String::from("John Doe"))];
        article.pub_dates = vec![PublicationDate {
            pub_type: None,
            year: Some(2023),
            month: None,
            day: None,
        }];
        article.doi = Some(String::from("10.1000/test"));
        article.article_type = Some(String::from("research-article"));
        article.keywords = ["test", "example"]
            .iter()
            .map(|keyword| keyword.to_string())
            .collect();

        let rendered = PmcMarkdownConverter::new().convert(&article);

        assert!(rendered.contains("# Test Article"));
        assert!(rendered.contains("**Authors:** John Doe"));
        assert!(rendered.contains("**Journal:** Test Journal"));
        assert!(rendered.contains("DOI: 10.1000/test"));
        assert!(rendered.contains("**Keywords:** test, example"));
    }

    /// With frontmatter enabled the output opens with a YAML block that
    /// carries the article metadata.
    #[test]
    fn test_yaml_frontmatter_basic() {
        let mut article = test_article("Test Article", "PMC1234567");
        article.pmid = Some(PubMedId::parse("12345").unwrap());
        article.authors = vec![
            Author::from_full_name(String::from("John Doe")),
            Author::from_full_name(String::from("Jane Smith")),
        ];
        article.pub_dates = vec![PublicationDate {
            pub_type: None,
            year: Some(2023),
            month: Some(5),
            day: Some(15),
        }];
        article.doi = Some(String::from("10.1000/test"));
        article.article_type = Some(String::from("research-article"));
        article.keywords = ["test", "example"]
            .iter()
            .map(|keyword| keyword.to_string())
            .collect();

        let rendered = PmcMarkdownConverter::new()
            .with_yaml_frontmatter(true)
            .convert(&article);

        // Check frontmatter delimiters
        assert!(rendered.starts_with("---\n"));
        assert_eq!(
            rendered.matches("---").count(),
            2,
            "Should have opening and closing YAML frontmatter delimiters"
        );

        // Check basic fields (serde_yaml format)
        assert!(rendered.contains("title: Test Article"));
        assert!(rendered.contains("authors:"));
        assert!(rendered.contains("- John Doe"));
        assert!(rendered.contains("- Jane Smith"));
        assert!(rendered.contains("journal: Test Journal"));

        // The date may be emitted quoted or unquoted.
        let quoted_date = rendered.contains("pub_date: '2023-05-15'");
        let plain_date = rendered.contains("pub_date: 2023-05-15");
        assert!(quoted_date || plain_date);

        assert!(rendered.contains("pmcid: PMC1234567"));
        assert!(rendered.contains("pmid: '12345'"));
        assert!(rendered.contains("doi: 10.1000/test"));
        assert!(rendered.contains("article_type: research-article"));
        assert!(rendered.contains("keywords:"));
        assert!(rendered.contains("- test"));
        assert!(rendered.contains("- example"));
    }

    /// Values containing YAML-significant characters (`:`, `#`, `&`) come out
    /// quoted, with either quote style accepted.
    #[test]
    fn test_yaml_frontmatter_with_special_characters() {
        let mut article = test_article("COVID-19: A Comprehensive Study", "PMC7890123");
        article.journal.title = String::from("Nature: Medicine & Science");
        article.authors = vec![Author::from_full_name(String::from("O'Brien, Michael"))];
        article.pub_dates = vec![PublicationDate {
            pub_type: None,
            year: Some(2023),
            month: None,
            day: None,
        }];
        article.doi = Some(String::from("10.1038/s41591-023-01234-5"));
        article.article_type = Some(String::from("research-article"));
        article.keywords = ["#COVID-19", "SARS-CoV-2", "vaccine"]
            .iter()
            .map(|keyword| keyword.to_string())
            .collect();

        let rendered = PmcMarkdownConverter::new()
            .with_yaml_frontmatter(true)
            .convert(&article);

        let title_single = rendered.contains("title: 'COVID-19: A Comprehensive Study'");
        let title_double = rendered.contains("title: \"COVID-19: A Comprehensive Study\"");
        assert!(title_single || title_double);

        let journal_single = rendered.contains("journal: 'Nature: Medicine & Science'");
        let journal_double = rendered.contains("journal: \"Nature: Medicine & Science\"");
        assert!(journal_single || journal_double);

        assert!(rendered.contains("'#COVID-19'") || rendered.contains("\"#COVID-19\""));
        assert!(rendered.contains("SARS-CoV-2"));
    }

    /// Frontmatter is opt-in: the default output keeps the heading-based
    /// layout and does not start with a YAML delimiter.
    #[test]
    fn test_yaml_frontmatter_backward_compatibility() {
        // Default should NOT use YAML frontmatter
        let conv = PmcMarkdownConverter::new();
        assert!(!conv.config.use_yaml_frontmatter);

        let article = test_article("Test Article", "PMC1234567");
        let rendered = conv.convert(&article);

        // Should use old format with markdown heading
        assert!(rendered.contains("# Test Article"));
        assert!(rendered.contains("**Journal:** Test Journal"));
        // Should NOT have YAML frontmatter delimiters
        assert!(!rendered.starts_with("---\n"));
    }

    /// The frontmatter switch composes with the other builder methods.
    #[test]
    fn test_builder_pattern_with_yaml_frontmatter() {
        let conv = PmcMarkdownConverter::new()
            .with_yaml_frontmatter(true)
            .with_include_metadata(true)
            .with_heading_style(HeadingStyle::ATX);
        let config = &conv.config;

        assert!(config.use_yaml_frontmatter);
        assert!(config.include_metadata);
        assert_eq!(config.heading_style, HeadingStyle::ATX);
    }
}