pubmed_parser/common/
xml_utils.rs1use std::borrow::Cow;
7use std::collections::HashMap;
8use tracing::debug;
9
10pub fn strip_inline_html_tags(xml: &str) -> Cow<'_, str> {
35 use regex::Regex;
36 use std::sync::OnceLock;
37
38 static INLINE_TAG_REGEX: OnceLock<Regex> = OnceLock::new();
41 let re = INLINE_TAG_REGEX.get_or_init(|| {
42 Regex::new(r"</?(?:i|b|u|sup|sub|em|strong|italic|bold)>")
43 .expect("Failed to compile inline tag regex")
44 });
45
46 let cleaned = re.replace_all(xml, "");
47
48 if let Cow::Owned(ref _s) = cleaned {
50 debug!(
51 "Stripped inline HTML tags: original {} bytes -> cleaned {} bytes (removed {} bytes)",
52 xml.len(),
53 cleaned.len(),
54 xml.len() - cleaned.len()
55 );
56 }
57
58 cleaned
59}
60
61pub fn extract_text_between(content: &str, start: &str, end: &str) -> Option<String> {
75 extract_text_between_ref(content, start, end).map(|s| s.to_string())
76}
77
/// Returns the trimmed slice of `content` between the first occurrence of
/// `start` and the first occurrence of `end` that follows it.
///
/// The result borrows from `content`; nothing is allocated. Returns `None`
/// when `start` is absent, or when `end` does not occur after it.
pub fn extract_text_between_ref<'a>(content: &'a str, start: &str, end: &str) -> Option<&'a str> {
    let (_, after_start) = content.split_once(start)?;
    let (inner, _) = after_start.split_once(end)?;
    Some(inner.trim())
}
86
/// Returns the double-quoted value of `attribute` found in `content`.
///
/// The search looks for `attribute="` and returns everything up to the next
/// `"`. An occurrence that starts the string or follows whitespace is
/// preferred, so asking for `id` in `data-id="x" id="y"` yields `y` rather
/// than the value of `data-id`.
///
/// If no such whole-name occurrence exists, the first plain substring match is
/// used instead. This keeps lookups such as `href` against `xlink:href="..."`
/// working as they did before.
///
/// Returns `None` when the attribute is absent or its value has no closing
/// quote. Single-quoted values are not recognised.
pub fn extract_attribute_value(content: &str, attribute: &str) -> Option<String> {
    let pattern = format!("{attribute}=\"");

    // True when the match at `idx` is not the tail of a longer attribute name.
    let on_name_boundary = |idx: usize| {
        content[..idx]
            .chars()
            .next_back()
            .map_or(true, char::is_whitespace)
    };

    let attr_start = content
        .match_indices(pattern.as_str())
        .map(|(idx, _)| idx)
        .find(|&idx| on_name_boundary(idx))
        // Legacy fallback: accept the first raw substring match.
        .or_else(|| content.find(pattern.as_str()))?;

    let value = &content[attr_start + pattern.len()..];
    value.find('"').map(|end| value[..end].to_string())
}
107
/// Removes everything between `<` and the following `>` and returns the
/// remaining text with leading and trailing whitespace trimmed.
///
/// A `>` that appears in text outside any tag (for example `5 > 3`) is kept.
/// If a `<` is never closed by a `>`, the rest of the input is treated as
/// part of that tag and dropped.
///
/// Entities are not decoded and no whitespace is inserted where tags are
/// removed.
pub fn strip_xml_tags(content: &str) -> String {
    let mut text = String::with_capacity(content.len());
    let mut rest = content;

    // Copy whole text segments between tags. Slicing at `<` / `>` is always
    // on a char boundary because both are single-byte ASCII.
    while let Some(lt) = rest.find('<') {
        text.push_str(&rest[..lt]);
        match rest[lt..].find('>') {
            Some(gt) => rest = &rest[lt + gt + 1..],
            None => {
                // Unterminated tag: nothing after it counts as text.
                rest = "";
                break;
            }
        }
    }
    text.push_str(rest);

    let trimmed = text.trim();
    if trimmed.len() == text.len() {
        // Nothing to trim: hand back the buffer without reallocating.
        text
    } else {
        trimmed.to_string()
    }
}
146
/// Collects every `<tag ...>...</tag>` span in `content`, including the
/// opening and closing tags themselves, in document order.
///
/// The opening tag must name exactly `tag`: the name has to be followed by
/// `>`, `/` or whitespace, so searching for `p` does not match `<pre>`.
///
/// Each element ends at the first `</tag>` after its opening tag. Nested
/// elements with the same name and self-closing tags are not treated
/// specially. Scanning stops at the first opening tag that lacks a closing
/// `>` or a matching end tag.
pub fn find_all_tags(content: &str, tag: &str) -> Vec<String> {
    let open = format!("<{tag}");
    let close = format!("</{tag}>");

    let mut elements = Vec::new();
    let mut cursor = 0;

    while let Some(offset) = content[cursor..].find(open.as_str()) {
        let elem_start = cursor + offset;
        let name_end = elem_start + open.len();

        // Reject longer names that merely start with `tag` (e.g. `<pre` for `p`).
        let exact_name = matches!(
            content[name_end..].chars().next(),
            Some(c) if c == '>' || c == '/' || c.is_whitespace()
        );
        if !exact_name {
            cursor = name_end;
            continue;
        }

        let body_start = match content[name_end..].find('>') {
            Some(gt) => name_end + gt + 1,
            None => break,
        };
        let elem_end = match content[body_start..].find(close.as_str()) {
            Some(rel) => body_start + rel + close.len(),
            None => break,
        };

        elements.push(content[elem_start..elem_end].to_string());
        cursor = elem_end;
    }

    elements
}
188
/// Collects the trimmed text of every `start ... end` span in `content`.
///
/// Spans whose trimmed text is empty are skipped. After a span is found the
/// search resumes at its `end` marker, so when `start` and `end` are the same
/// delimiter one occurrence can close a span and open the next.
///
/// An empty `start` marker yields an empty result. Without that guard the
/// scan could not advance past a matched `end` and would never terminate.
pub fn extract_all_text_between(content: &str, start: &str, end: &str) -> Vec<String> {
    let mut results = Vec::new();
    if start.is_empty() {
        return results;
    }

    let mut pos = 0;
    while let Some(found) = content[pos..].find(start) {
        let text_start = pos + found + start.len();
        let text_end = match content[text_start..].find(end) {
            Some(rel) => text_start + rel,
            None => break,
        };

        let text = content[text_start..text_end].trim();
        if !text.is_empty() {
            results.push(text.to_string());
        }

        // Resume at the end marker (not past it); see the doc comment.
        pos = text_end;
    }

    results
}
222
/// Returns the raw inner content of the first `<tag ...>...</tag>` element in
/// `content`, without the surrounding tags.
///
/// The opening tag must name exactly `tag`: the name has to be followed by
/// `>`, `/` or whitespace, so looking up `Author` does not match
/// `<AuthorList>`.
///
/// The content ends at the first `</tag>` after the opening tag; nested
/// elements with the same name are not balanced. Returns `None` when there is
/// no such opening tag, it has no closing `>`, or no end tag follows.
pub fn extract_element_content(content: &str, tag: &str) -> Option<String> {
    let open = format!("<{tag}");
    let close = format!("</{tag}>");

    // Byte offset just past the tag name of the first exactly-named opening tag.
    let name_end = content
        .match_indices(open.as_str())
        .map(|(idx, _)| idx + open.len())
        .find(|&after| {
            matches!(
                content[after..].chars().next(),
                Some(c) if c == '>' || c == '/' || c.is_whitespace()
            )
        })?;

    let body_start = name_end + content[name_end..].find('>')? + 1;
    let body_len = content[body_start..].find(close.as_str())?;

    Some(content[body_start..body_start + body_len].to_string())
}
251
/// Parses the attributes of the first tag in `tag` into a name -> value map.
///
/// Only the text between the first `<` and the next `>` is considered. The
/// tag name is separated from the attributes by the first ASCII whitespace
/// character (space, tab or newline), so tags broken across lines are handled.
///
/// Values may be quoted with `"` or `'`. The scanner is lenient: whitespace
/// around `=` is allowed, and tokens without a quoted value are skipped.
/// Scanning stops at a value whose closing quote is missing. When a name
/// occurs more than once, the last value wins.
///
/// Returns an empty map if there is no complete `<...>` or no attributes.
pub fn extract_all_attributes(tag: &str) -> HashMap<String, String> {
    let mut attributes = HashMap::new();

    // Text strictly between the first '<' and the following '>'.
    let inner = match tag
        .find('<')
        .and_then(|lt| tag[lt..].find('>').map(|gt| &tag[lt + 1..lt + gt]))
    {
        Some(inner) => inner,
        None => return attributes,
    };

    // Everything after the tag name; ASCII whitespace is one byte wide, so
    // `ws + 1` is a valid char boundary.
    let attrs_part = match inner.find(|c: char| c.is_ascii_whitespace()) {
        Some(ws) => &inner[ws + 1..],
        None => return attributes,
    };

    let bytes = attrs_part.as_bytes();
    let len = bytes.len();
    let skip_whitespace = |mut at: usize| {
        while at < len && bytes[at].is_ascii_whitespace() {
            at += 1;
        }
        at
    };

    let mut pos = 0;
    while pos < len {
        pos = skip_whitespace(pos);
        if pos >= len {
            break;
        }

        // Attribute name: runs up to '=' or whitespace.
        let name_start = pos;
        while pos < len && bytes[pos] != b'=' && !bytes[pos].is_ascii_whitespace() {
            pos += 1;
        }
        if pos >= len {
            break;
        }
        let attr_name = &attrs_part[name_start..pos];

        // Optional whitespace, at most one '=', optional whitespace.
        pos = skip_whitespace(pos);
        if pos < len && bytes[pos] == b'=' {
            pos += 1;
        }
        pos = skip_whitespace(pos);
        if pos >= len {
            break;
        }

        let quote = bytes[pos];
        if quote == b'"' || quote == b'\'' {
            let value_start = pos + 1;
            match bytes[value_start..].iter().position(|&b| b == quote) {
                Some(rel) => {
                    let value_end = value_start + rel;
                    attributes.insert(
                        attr_name.to_string(),
                        attrs_part[value_start..value_end].to_string(),
                    );
                    pos = value_end + 1;
                }
                // Unterminated value: nothing further can be parsed.
                None => break,
            }
        }
        // Not a quoted value: leave `pos` here so the token is re-read as the
        // next attribute name on the following iteration.
    }

    attributes
}
345
/// Decodes the five predefined XML entities (`amp`, `lt`, `gt`, `quot`,
/// `apos`) and numeric character references (decimal `#NNN`, hexadecimal
/// `#xHH` / `#XHH`) in `content`.
///
/// Input without any `&` is returned as `Cow::Borrowed` with no allocation.
///
/// Anything that is not a recognised reference is copied through unchanged.
/// That covers unknown names, invalid code points, names longer than 10
/// bytes, and a bare `&` with no terminating `;`. A bare `&` never swallows a
/// reference that follows it: in `AT&T &lt;` the `&lt;` is still decoded.
pub fn decode_xml_entities(content: &str) -> Cow<'_, str> {
    // Longest entity body (between '&' and ';') that is considered.
    const MAX_ENTITY_LEN: usize = 10;

    if !content.contains('&') {
        return Cow::Borrowed(content);
    }

    let mut decoded = String::with_capacity(content.len());
    let mut rest = content;

    while let Some(amp) = rest.find('&') {
        decoded.push_str(&rest[..amp]);
        let after = &rest[amp + 1..];

        // A reference runs up to the next ';'. Hitting another '&' first
        // means this '&' was a bare ampersand.
        let replacement = after
            .find(|c: char| c == ';' || c == '&')
            .filter(|&stop| stop <= MAX_ENTITY_LEN && after[stop..].starts_with(';'))
            .and_then(|stop| decode_single_xml_entity(&after[..stop]).map(|ch| (ch, stop)));

        match replacement {
            Some((ch, stop)) => {
                decoded.push(ch);
                rest = &after[stop + 1..];
            }
            None => {
                // Keep the '&' literally and rescan what follows as text.
                decoded.push('&');
                rest = after;
            }
        }
    }
    decoded.push_str(rest);

    Cow::Owned(decoded)
}

/// Maps one entity body (the text between `&` and `;`) to its character, or
/// `None` if it is not a predefined entity or a valid numeric reference.
fn decode_single_xml_entity(name: &str) -> Option<char> {
    match name {
        "amp" => Some('&'),
        "lt" => Some('<'),
        "gt" => Some('>'),
        "quot" => Some('"'),
        "apos" => Some('\''),
        _ => {
            let reference = name.strip_prefix('#')?;
            let hex_digits = reference
                .strip_prefix('x')
                .or_else(|| reference.strip_prefix('X'));
            let code = match hex_digits {
                Some(hex) => u32::from_str_radix(hex, 16).ok()?,
                None => reference.parse::<u32>().ok()?,
            };
            char::from_u32(code)
        }
    }
}
415
/// Reports whether `tag` ends with `/>` once trailing whitespace is ignored.
///
/// Only the final two characters are inspected; the rest of `tag` is not
/// validated.
pub fn is_self_closing_tag(tag: &str) -> bool {
    let trimmed = tag.trim_end();
    matches!(trimmed.as_bytes(), [.., b'/', b'>'])
}
428
429pub fn extract_section_text(content: &str, section_tag: &str) -> Option<String> {
442 extract_element_content(content, section_tag)
443 .map(|section_content| strip_xml_tags(§ion_content))
444}
445
/// Unit tests for the XML helper functions in this module.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_strip_inline_html_tags() {
        // Superscript tags are removed while their text is kept.
        let xml_with_sup = r#"<AbstractText>CO<sup>2</sup> levels</AbstractText>"#;
        let cleaned = strip_inline_html_tags(xml_with_sup);
        assert!(
            !cleaned.contains("<sup>"),
            "Cleaned XML still contains <sup>: {}",
            cleaned
        );
        assert!(
            !cleaned.contains("</sup>"),
            "Cleaned XML still contains </sup>: {}",
            cleaned
        );
        assert!(cleaned.contains("CO2 levels"));

        // Italic tags.
        let xml_with_i = r#"<AbstractText>The <i>e.g.</i> example</AbstractText>"#;
        let cleaned = strip_inline_html_tags(xml_with_i);
        assert!(!cleaned.contains("<i>"));
        assert!(!cleaned.contains("</i>"));
        assert!(cleaned.contains("e.g."));

        // Subscript tags.
        let xml_with_sub = r#"<AbstractText>H<sub>2</sub>O</AbstractText>"#;
        let cleaned = strip_inline_html_tags(xml_with_sub);
        assert!(!cleaned.contains("<sub>"));
        assert!(!cleaned.contains("</sub>"));
        assert!(cleaned.contains("H2O"));

        // Structural tags must survive; only inline tags are stripped.
        let xml_with_mixed = r#"<Article><Title>CO<sup>2</sup> Study</Title></Article>"#;
        let cleaned = strip_inline_html_tags(xml_with_mixed);
        assert!(cleaned.contains("<Article>"));
        assert!(cleaned.contains("</Article>"));
        assert!(cleaned.contains("<Title>"));
        assert!(!cleaned.contains("<sup>"));
    }

    #[test]
    fn test_extract_text_between() {
        let content = "<title>Test Title</title>";
        let result = extract_text_between(content, "<title>", "</title>");
        assert_eq!(result, Some("Test Title".to_string()));
    }

    #[test]
    fn test_extract_attribute_value() {
        let content = r#"<element id="test-id" class="test-class">"#;
        let result = extract_attribute_value(content, "id");
        assert_eq!(result, Some("test-id".to_string()));
    }

    #[test]
    fn test_strip_xml_tags() {
        let content = "<p>This is <b>bold</b> text</p>";
        let result = strip_xml_tags(content);
        assert_eq!(result, "This is bold text");
    }

    #[test]
    fn test_find_all_tags() {
        let content = "<p>First paragraph</p><p>Second paragraph</p>";
        let results = find_all_tags(content, "p");
        assert_eq!(results.len(), 2);
        assert_eq!(results[0], "<p>First paragraph</p>");
        assert_eq!(results[1], "<p>Second paragraph</p>");
    }

    #[test]
    fn test_extract_all_text_between() {
        let content = "<p>First</p><p>Second</p><p>Third</p>";
        let results = extract_all_text_between(content, "<p>", "</p>");
        assert_eq!(results, vec!["First", "Second", "Third"]);
    }

    #[test]
    fn test_extract_element_content() {
        let content = "<section><title>Test</title><p>Content</p></section>";
        let result = extract_element_content(content, "section");
        assert_eq!(
            result,
            Some("<title>Test</title><p>Content</p>".to_string())
        );
    }

    #[test]
    fn test_is_self_closing_tag() {
        assert!(is_self_closing_tag("<img src=\"test.jpg\"/>"));
        assert!(!is_self_closing_tag("<img src=\"test.jpg\">"));
    }

    #[test]
    fn test_extract_all_attributes() {
        let tag = r#"<element id="test-id" class="test-class" data-value="123">"#;
        let attributes = extract_all_attributes(tag);

        assert_eq!(attributes.get("id"), Some(&"test-id".to_string()));
        assert_eq!(attributes.get("class"), Some(&"test-class".to_string()));
        assert_eq!(attributes.get("data-value"), Some(&"123".to_string()));
    }

    #[test]
    fn test_extract_section_text() {
        let content = "<section><title>Test</title><p>Content</p></section>";
        let result = extract_section_text(content, "section");
        // Tags are removed without inserting separators between elements.
        assert_eq!(result, Some("TestContent".to_string()));
    }

    #[test]
    fn test_decode_xml_entities() {
        // The inputs must be the escaped forms; the expected values are the
        // decoded characters.

        // Predefined named entities.
        assert_eq!(decode_xml_entities("&amp;"), "&");
        assert_eq!(decode_xml_entities("&lt;"), "<");
        assert_eq!(decode_xml_entities("&gt;"), ">");
        assert_eq!(decode_xml_entities("&quot;"), "\"");
        assert_eq!(decode_xml_entities("&apos;"), "'");

        // Decimal character references.
        assert_eq!(decode_xml_entities("&#169;"), "©");
        assert_eq!(decode_xml_entities("&#231;"), "ç");
        assert_eq!(decode_xml_entities("&#193;"), "Á");

        // Hexadecimal character references, both prefix cases.
        assert_eq!(decode_xml_entities("&#xA9;"), "©");
        assert_eq!(decode_xml_entities("&#XA9;"), "©");

        // Input without '&' is returned borrowed.
        let result = decode_xml_entities("no entities here");
        assert!(matches!(result, Cow::Borrowed(_)));

        // Mixed text and references.
        assert_eq!(
            decode_xml_entities("&#169; 2021 Fran&#231;ois &amp; Co"),
            "© 2021 François & Co"
        );
    }
}