pubmed_parser/common/
xml_utils.rs

1//! Common XML parsing utilities shared between PubMed and PMC parsers
2//!
3//! This module provides reusable XML parsing functions for both string-based
4//! and serde-based XML parsing workflows.
5
6use std::borrow::Cow;
7use std::collections::HashMap;
8use tracing::debug;
9
10/// Strip inline HTML-like formatting tags from XML content
11///
12/// Handles tags like `<i>`, `<sup>`, `<sub>`, `<b>`, `<u>` that can appear in AbstractText and ArticleTitle.
13/// These tags cause parsing issues with quick-xml's serde deserializer.
14///
15/// This function is used by both PubMed and PMC parsers to clean XML before parsing.
16///
17/// # Arguments
18///
19/// * `xml` - The XML string to clean
20///
21/// # Returns
22///
23/// A cleaned XML string with inline HTML tags removed
24///
25/// # Example
26///
27/// ```ignore
28/// use pubmed_parser::common::xml_utils::strip_inline_html_tags;
29///
30/// let xml = "<AbstractText>CO<sup>2</sup> levels</AbstractText>";
31/// let cleaned = strip_inline_html_tags(xml);
32/// assert_eq!(cleaned, "<AbstractText>CO2 levels</AbstractText>");
33/// ```
34pub fn strip_inline_html_tags(xml: &str) -> Cow<'_, str> {
35    use regex::Regex;
36    use std::sync::OnceLock;
37
38    // Regex pattern to match inline HTML tags (both opening and closing)
39    // Matches: <i>, </i>, <b>, </b>, <sup>, </sup>, <sub>, </sub>, <u>, </u>, <em>, </em>, <strong>, </strong>
40    static INLINE_TAG_REGEX: OnceLock<Regex> = OnceLock::new();
41    let re = INLINE_TAG_REGEX.get_or_init(|| {
42        Regex::new(r"</?(?:i|b|u|sup|sub|em|strong|italic|bold)>")
43            .expect("Failed to compile inline tag regex")
44    });
45
46    let cleaned = re.replace_all(xml, "");
47
48    // Log if any tags were stripped
49    if let Cow::Owned(ref _s) = cleaned {
50        debug!(
51            "Stripped inline HTML tags: original {} bytes -> cleaned {} bytes (removed {} bytes)",
52            xml.len(),
53            cleaned.len(),
54            xml.len() - cleaned.len()
55        );
56    }
57
58    cleaned
59}
60
61/// Extract text between two XML tags
62///
63/// Finds the first occurrence of text between start and end tags.
64///
65/// # Arguments
66///
67/// * `content` - The XML content to search
68/// * `start` - The opening tag (e.g., "<title>")
69/// * `end` - The closing tag (e.g., "</title>")
70///
71/// # Returns
72///
73/// Some(String) with the text between tags, or None if not found
74pub fn extract_text_between(content: &str, start: &str, end: &str) -> Option<String> {
75    extract_text_between_ref(content, start, end).map(|s| s.to_string())
76}
77
/// Borrowing variant of [`extract_text_between`].
///
/// Looks for the first occurrence of `start`, then the first occurrence of
/// `end` after it, and returns the whitespace-trimmed slice in between
/// without allocating. Returns `None` when either marker cannot be found.
pub fn extract_text_between_ref<'a>(content: &'a str, start: &str, end: &str) -> Option<&'a str> {
    let (_, after_start) = content.split_once(start)?;
    let (inner, _) = after_start.split_once(end)?;
    Some(inner.trim())
}
86
/// Extract the value of a double-quoted attribute from XML tag text.
///
/// An occurrence of `attribute="` that starts at an attribute-name boundary
/// (the beginning of `content` or right after whitespace) is preferred, so
/// asking for `id` is not satisfied by an earlier `data-id="..."`. When no
/// occurrence sits on a boundary, the first plain substring match is used
/// instead; this keeps lookups such as `href` against `xlink:href="..."`
/// working.
///
/// # Arguments
///
/// * `content` - The XML tag content
/// * `attribute` - The attribute name to extract
///
/// # Returns
///
/// `Some(String)` with the raw attribute value (entities are not decoded),
/// or `None` if the attribute is absent or its value has no closing quote.
pub fn extract_attribute_value(content: &str, attribute: &str) -> Option<String> {
    let pattern = format!("{attribute}=\"");
    let mut fallback: Option<&str> = None;

    for (idx, _) in content.match_indices(pattern.as_str()) {
        let value_start = idx + pattern.len();
        // No closing quote here means there is none further right either.
        let Some(value_len) = content[value_start..].find('"') else {
            break;
        };
        let value = &content[value_start..value_start + value_len];

        let at_name_boundary = content[..idx]
            .chars()
            .next_back()
            .map_or(true, char::is_whitespace);
        if at_name_boundary {
            return Some(value.to_string());
        }

        // Remember the first suffix-style match in case no exact one exists.
        if fallback.is_none() {
            fallback = Some(value);
        }
    }

    fallback.map(str::to_string)
}
107
/// Strip XML tags from content
///
/// Removes everything from a `<` up to and including the next `>` and
/// returns the remaining text with leading and trailing whitespace trimmed.
/// No whitespace is inserted where a tag was removed. A `>` that appears
/// outside of any tag (for example in `p > 0.05`) is ordinary text and is
/// kept.
///
/// # Arguments
///
/// * `content` - The XML content to strip
///
/// # Returns
///
/// A string with all XML tags removed
pub fn strip_xml_tags(content: &str) -> String {
    let mut text = String::with_capacity(content.len());
    let mut in_tag = false;

    for ch in content.chars() {
        match ch {
            '<' => in_tag = true,
            // Only a '>' that closes an open tag is markup.
            '>' if in_tag => in_tag = false,
            _ if !in_tag => text.push(ch),
            _ => {}
        }
    }

    // Trim inside the existing buffer instead of copying into a new String.
    let kept_len = text.trim_end().len();
    text.truncate(kept_len);
    let leading_ws = text.len() - text.trim_start().len();
    text.replace_range(..leading_ws, "");

    text
}
146
/// Find all occurrences of a tag in content
///
/// Returns every `<tag ...>...</tag>` element (opening tag, inner content
/// and closing tag) in document order. The tag name must match exactly:
/// searching for `p` does not match `<pre>` or `<pub-date>`. A self-closing
/// element such as `<tag/>` is returned on its own.
///
/// Each element ends at the first closing tag found after its opening tag,
/// so nested elements with the same name are not balanced. Scanning stops
/// at the first opening tag that is never terminated or never closed.
///
/// # Arguments
///
/// * `content` - The XML content to search
/// * `tag` - The tag name to find (e.g., "p" for <p>...</p>)
///
/// # Returns
///
/// A vector of strings containing all found tags
pub fn find_all_tags(content: &str, tag: &str) -> Vec<String> {
    let open = format!("<{tag}");
    let close = format!("</{tag}>");
    let mut results = Vec::new();

    let mut pos = 0;
    while let Some(rel_start) = content[pos..].find(&open) {
        let start = pos + rel_start;
        let after_name = start + open.len();

        // The name must end here; otherwise this is a longer tag name that
        // merely shares the prefix (e.g. `<pre>` when looking for `p`).
        let name_ends_here = matches!(
            content[after_name..].chars().next(),
            Some(c) if c == '>' || c == '/' || c.is_whitespace()
        );
        if !name_ends_here {
            pos = after_name;
            continue;
        }

        // Find the end of the opening tag
        let Some(rel_gt) = content[after_name..].find('>') else {
            break;
        };
        let open_end = after_name + rel_gt + 1;

        // A self-closing element has no separate closing tag.
        if content[after_name..open_end].ends_with("/>") {
            results.push(content[start..open_end].to_string());
            pos = open_end;
            continue;
        }

        // Find the closing tag
        let Some(rel_close) = content[open_end..].find(&close) else {
            break;
        };
        let end = open_end + rel_close + close.len();
        results.push(content[start..end].to_string());
        pos = end;
    }

    results
}
188
/// Extract content between tags for all occurrences
///
/// Repeatedly finds `start`, then the next `end` after it, and collects the
/// whitespace-trimmed text in between. Segments that are empty after
/// trimming are skipped. Scanning resumes at the position of the `end`
/// marker that was just matched and stops at the first `start` that has no
/// following `end`.
///
/// An empty `start` marker yields an empty result; it would match at every
/// position and the scan could never advance.
///
/// # Arguments
///
/// * `content` - The XML content to search
/// * `start` - The opening tag (e.g., "<p>")
/// * `end` - The closing tag (e.g., "</p>")
///
/// # Returns
///
/// A vector of strings containing all found text between tags
pub fn extract_all_text_between(content: &str, start: &str, end: &str) -> Vec<String> {
    let mut results = Vec::new();
    if start.is_empty() {
        return results;
    }

    let mut pos = 0;
    while let Some(rel_start) = content[pos..].find(start) {
        let inner_start = pos + rel_start + start.len();
        let Some(rel_end) = content[inner_start..].find(end) else {
            break;
        };
        let inner_end = inner_start + rel_end;

        let text = content[inner_start..inner_end].trim();
        if !text.is_empty() {
            results.push(text.to_string());
        }

        // `start` is non-empty, so `inner_end` is always past the old `pos`.
        pos = inner_end;
    }

    results
}
222
/// Extract element content with its tag name
///
/// Finds the first element whose name is exactly `tag` and returns the text
/// between its opening tag and the first matching closing tag. Longer names
/// that only share the prefix (e.g. `<title-group>` when looking for
/// `title`) are skipped. A self-closing element (`<tag/>`) has empty
/// content. Nested elements with the same name are not balanced: the
/// content ends at the first closing tag.
///
/// # Arguments
///
/// * `content` - The XML content to search
/// * `tag` - The tag name (e.g., "section")
///
/// # Returns
///
/// Some(String) with the element's inner content, or None if the element is
/// not found, its opening tag is never terminated, or it is never closed
pub fn extract_element_content(content: &str, tag: &str) -> Option<String> {
    let open = format!("<{tag}");
    let close = format!("</{tag}>");

    let mut pos = 0;
    while let Some(rel_start) = content[pos..].find(&open) {
        let after_name = pos + rel_start + open.len();

        // Reject candidates where the tag name continues past `tag`.
        let name_ends_here = matches!(
            content[after_name..].chars().next(),
            Some(c) if c == '>' || c == '/' || c.is_whitespace()
        );
        if !name_ends_here {
            pos = after_name;
            continue;
        }

        let rel_gt = content[after_name..].find('>')?;
        let inner_start = after_name + rel_gt + 1;

        // `<tag ... />` carries no content and has no closing tag.
        if content[after_name..inner_start].ends_with("/>") {
            return Some(String::new());
        }

        let inner_len = content[inner_start..].find(&close)?;
        return Some(content[inner_start..inner_start + inner_len].to_string());
    }

    None
}
251
/// Extract all attributes from an XML tag
///
/// Parses the first `<...>` tag in the input and returns a HashMap of
/// attribute names to values. The tag name may be separated from the
/// attributes by any ASCII whitespace (space, tab, newline), so tags that
/// are wrapped over several lines are handled. Values may be quoted with
/// `"` or `'`; unquoted values and attributes without a value are skipped.
/// Values are returned raw (entities are not decoded).
///
/// # Arguments
///
/// * `tag` - The XML tag to parse (e.g., "<element id=\"test\" class=\"foo\">")
///
/// # Returns
///
/// A HashMap containing all attribute name-value pairs; empty when the
/// input holds no complete tag or the tag has no attributes
pub fn extract_all_attributes(tag: &str) -> HashMap<String, String> {
    let mut attributes = HashMap::new();

    // Locate the first `<...>` span.
    let Some(open) = tag.find('<') else {
        return attributes;
    };
    let Some(close) = tag[open..].find('>') else {
        return attributes;
    };
    let tag_content = &tag[open + 1..open + close];

    // Skip the tag name: it ends at the first whitespace of any kind.
    let Some(name_end) = tag_content.find(|c: char| c.is_ascii_whitespace()) else {
        return attributes;
    };
    // ASCII whitespace is a single byte, so `+ 1` stays on a char boundary.
    let attrs_part = &tag_content[name_end + 1..];
    let bytes = attrs_part.as_bytes();
    let len = bytes.len();

    // Single forward pass over the bytes. Every slice boundary used below
    // sits next to an ASCII delimiter, so slicing `attrs_part` is safe even
    // when names or values contain multi-byte characters.
    let mut pos = 0;
    while pos < len {
        // Skip whitespace
        while pos < len && bytes[pos].is_ascii_whitespace() {
            pos += 1;
        }
        if pos >= len {
            break;
        }

        // Attribute name runs up to '=' or whitespace.
        let name_start = pos;
        while pos < len && bytes[pos] != b'=' && !bytes[pos].is_ascii_whitespace() {
            pos += 1;
        }
        if pos >= len {
            break;
        }
        let attr_name = &attrs_part[name_start..pos];

        // Skip optional whitespace and at most one '='.
        while pos < len {
            if bytes[pos] == b'=' {
                pos += 1;
                break;
            } else if bytes[pos].is_ascii_whitespace() {
                pos += 1;
            } else {
                break;
            }
        }

        // Skip whitespace after '='
        while pos < len && bytes[pos].is_ascii_whitespace() {
            pos += 1;
        }
        if pos >= len {
            break;
        }

        // Extract quoted value; anything unquoted is re-scanned as a name
        // on the next iteration and therefore skipped.
        let quote_byte = bytes[pos];
        if quote_byte == b'"' || quote_byte == b'\'' {
            pos += 1; // Skip opening quote
            let value_start = pos;
            while pos < len && bytes[pos] != quote_byte {
                pos += 1;
            }
            // Only record the attribute when the closing quote exists.
            if pos < len {
                attributes.insert(
                    attr_name.to_string(),
                    attrs_part[value_start..pos].to_string(),
                );
                pos += 1; // Skip closing quote
            }
        }
    }

    attributes
}
345
/// Decode XML character entities in a string
///
/// Decodes the five predefined named entities (`&amp;`, `&lt;`, `&gt;`,
/// `&quot;`, `&apos;`) and numeric character references in decimal
/// (`&#169;`) or hexadecimal (`&#x00A9;`) form. Decoding is a single pass:
/// `&amp;amp;` becomes `&amp;`.
///
/// Anything that is not a recognised reference is copied through unchanged:
/// unknown names such as `&nbsp;`, numeric references that do not denote a
/// valid `char`, and bare ampersands. A bare `&` never prevents a later
/// reference from being decoded (`AT&T &amp; Co` becomes `AT&T & Co`).
///
/// # Returns
///
/// The input borrowed unchanged when it contains no `&`, otherwise an owned
/// string with the references decoded.
pub fn decode_xml_entities(content: &str) -> Cow<'_, str> {
    // Longest entity body (text between '&' and ';') that is considered.
    const MAX_ENTITY_NAME_LEN: usize = 10;

    if !content.contains('&') {
        return Cow::Borrowed(content);
    }

    let mut result = String::with_capacity(content.len());
    let mut rest = content;

    while let Some(amp) = rest.find('&') {
        result.push_str(&rest[..amp]);
        let after = &rest[amp + 1..];

        // ';' is ASCII, so its byte index is always a valid slice boundary.
        let decoded = after
            .bytes()
            .take(MAX_ENTITY_NAME_LEN + 1)
            .position(|b| b == b';')
            .and_then(|semi| decode_entity_name(&after[..semi]).map(|ch| (ch, semi + 1)));

        match decoded {
            Some((ch, consumed)) => {
                result.push(ch);
                rest = &after[consumed..];
            }
            None => {
                // Not a reference: keep the '&' and continue right after it
                // so that following text is examined on its own.
                result.push('&');
                rest = after;
            }
        }
    }

    result.push_str(rest);
    Cow::Owned(result)
}

/// Map the body of an entity reference (without `&` and `;`) to its character.
fn decode_entity_name(name: &str) -> Option<char> {
    match name {
        "amp" => Some('&'),
        "lt" => Some('<'),
        "gt" => Some('>'),
        "quot" => Some('"'),
        "apos" => Some('\''),
        _ => {
            let body = name.strip_prefix('#')?;
            let (radix, digits) = match body.strip_prefix(|c: char| c == 'x' || c == 'X') {
                Some(hex) => (16, hex),
                None => (10, body),
            };
            // Digits only: the integer parser would also accept a leading '+'.
            if digits.is_empty() || !digits.chars().all(|c| c.is_digit(radix)) {
                return None;
            }
            u32::from_str_radix(digits, radix)
                .ok()
                .and_then(char::from_u32)
        }
    }
}
415
/// Report whether a tag is written in self-closing form.
///
/// # Arguments
///
/// * `tag` - The XML tag to check
///
/// # Returns
///
/// `true` when the tag, ignoring trailing whitespace, ends with "/>";
/// `false` otherwise
pub fn is_self_closing_tag(tag: &str) -> bool {
    let without_trailing_ws = tag.trim_end();
    without_trailing_ws.strip_suffix("/>").is_some()
}
428
429/// Extract text content from a section, handling nested tags
430///
431/// Combines element extraction and tag stripping for convenience.
432///
433/// # Arguments
434///
435/// * `content` - The XML content to search
436/// * `section_tag` - The section tag name
437///
438/// # Returns
439///
440/// Some(String) with the section text (tags removed), or None if not found
441pub fn extract_section_text(content: &str, section_tag: &str) -> Option<String> {
442    extract_element_content(content, section_tag)
443        .map(|section_content| strip_xml_tags(&section_content))
444}
445
#[cfg(test)]
mod tests {
    use super::*;

    /// Inline formatting tags disappear, their text and structural tags stay.
    #[test]
    fn test_strip_inline_html_tags() {
        // <sup>
        let cleaned = strip_inline_html_tags("<AbstractText>CO<sup>2</sup> levels</AbstractText>");
        assert!(
            !cleaned.contains("<sup>"),
            "Cleaned XML still contains <sup>: {}",
            cleaned
        );
        assert!(
            !cleaned.contains("</sup>"),
            "Cleaned XML still contains </sup>: {}",
            cleaned
        );
        assert!(cleaned.contains("CO2 levels"));

        // <i>
        let cleaned = strip_inline_html_tags("<AbstractText>The <i>e.g.</i> example</AbstractText>");
        assert!(!cleaned.contains("<i>"));
        assert!(!cleaned.contains("</i>"));
        assert!(cleaned.contains("e.g."));

        // <sub>
        let cleaned = strip_inline_html_tags("<AbstractText>H<sub>2</sub>O</AbstractText>");
        assert!(!cleaned.contains("<sub>"));
        assert!(!cleaned.contains("</sub>"));
        assert!(cleaned.contains("H2O"));

        // Structural tags must survive while the inline one is removed.
        let cleaned =
            strip_inline_html_tags("<Article><Title>CO<sup>2</sup> Study</Title></Article>");
        for kept in ["<Article>", "</Article>", "<Title>"] {
            assert!(cleaned.contains(kept));
        }
        assert!(!cleaned.contains("<sup>"));
    }

    /// The text between the first marker pair is returned as an owned string.
    #[test]
    fn test_extract_text_between() {
        let found = extract_text_between("<title>Test Title</title>", "<title>", "</title>");
        assert_eq!(found, Some("Test Title".to_string()));
    }

    /// A double-quoted attribute value is found by name.
    #[test]
    fn test_extract_attribute_value() {
        let tag = r#"<element id="test-id" class="test-class">"#;
        assert_eq!(
            extract_attribute_value(tag, "id"),
            Some("test-id".to_string())
        );
    }

    /// Nested tags are removed and the text is joined as-is.
    #[test]
    fn test_strip_xml_tags() {
        let stripped = strip_xml_tags("<p>This is <b>bold</b> text</p>");
        assert_eq!(stripped, "This is bold text");
    }

    /// Each sibling element is returned complete with its tags.
    #[test]
    fn test_find_all_tags() {
        let found = find_all_tags("<p>First paragraph</p><p>Second paragraph</p>", "p");
        assert_eq!(found.len(), 2);
        assert_eq!(found[0], "<p>First paragraph</p>");
        assert_eq!(found[1], "<p>Second paragraph</p>");
    }

    /// Every marker pair contributes one entry, in document order.
    #[test]
    fn test_extract_all_text_between() {
        let found = extract_all_text_between("<p>First</p><p>Second</p><p>Third</p>", "<p>", "</p>");
        assert_eq!(found, vec!["First", "Second", "Third"]);
    }

    /// The inner markup of the element is returned untouched.
    #[test]
    fn test_extract_element_content() {
        let xml = "<section><title>Test</title><p>Content</p></section>";
        assert_eq!(
            extract_element_content(xml, "section"),
            Some("<title>Test</title><p>Content</p>".to_string())
        );
    }

    /// Only tags ending in "/>" count as self-closing.
    #[test]
    fn test_is_self_closing_tag() {
        assert!(is_self_closing_tag("<img src=\"test.jpg\"/>"));
        assert!(!is_self_closing_tag("<img src=\"test.jpg\">"));
    }

    /// All name/value pairs of a tag end up in the map.
    #[test]
    fn test_extract_all_attributes() {
        let attributes =
            extract_all_attributes(r#"<element id="test-id" class="test-class" data-value="123">"#);

        let expected = [
            ("id", "test-id"),
            ("class", "test-class"),
            ("data-value", "123"),
        ];
        for (name, value) in expected {
            assert_eq!(attributes.get(name), Some(&value.to_string()));
        }
    }

    /// Section text has its tags removed; no separator is inserted between
    /// the text of adjacent elements.
    #[test]
    fn test_extract_section_text() {
        let xml = "<section><title>Test</title><p>Content</p></section>";
        assert_eq!(
            extract_section_text(xml, "section"),
            Some("TestContent".to_string())
        );
    }

    /// Named, decimal and hexadecimal references are decoded.
    #[test]
    fn test_decode_xml_entities() {
        let cases = [
            // Named entities
            ("&amp;", "&"),
            ("&lt;", "<"),
            ("&gt;", ">"),
            ("&quot;", "\""),
            ("&apos;", "'"),
            // Numeric entities (decimal)
            ("&#169;", "©"),
            ("&#231;", "ç"),
            ("&#193;", "Á"),
            // Numeric entities (hexadecimal)
            ("&#xA9;", "©"),
            ("&#x00A9;", "©"),
            // Mixed content
            ("&#169; 2021 Fran&#231;ois &amp; Co", "© 2021 François & Co"),
        ];
        for (input, expected) in cases {
            assert_eq!(decode_xml_entities(input), expected);
        }

        // Input without '&' must come back borrowed (no allocation).
        let untouched = decode_xml_entities("no entities here");
        assert!(matches!(untouched, Cow::Borrowed(_)));
    }
}
pubmed_parser/common/xml_utils.rs

pubmed_parser/common/
xml_utils.rs