pubmed_parser/common/
ids.rs

//! ID validation and cleaning utilities for PubMed and PMC identifiers
//!
//! This module provides strongly-typed, validated ID types for PubMed IDs (PMIDs)
//! and PubMed Central IDs (PMC IDs).
5
6use crate::error::{ParseError, Result};
7use serde::{Deserialize, Serialize};
8use std::fmt;
9use std::str::FromStr;
10
11/// A validated PubMed ID (PMID)
12///
13/// PMIDs are numeric identifiers for articles in the PubMed database.
14/// This type ensures that the ID is valid and provides methods for
15/// parsing, cleaning, and converting between different representations.
16///
17/// # Examples
18///
19/// ```
20/// use pubmed_parser::common::PubMedId;
21///
22/// // Parse from string
23/// let pmid = PubMedId::parse("31978945").unwrap();
24/// assert_eq!(pmid.as_u32(), 31978945);
25/// assert_eq!(pmid.as_str(), "31978945");
26///
27/// // Parse with whitespace (automatically cleaned)
28/// let pmid = PubMedId::parse("  31978945  ").unwrap();
29/// assert_eq!(pmid.as_u32(), 31978945);
30///
31/// // From u32
32/// let pmid = PubMedId::from_u32(31978945);
33/// assert_eq!(pmid.to_string(), "31978945");
34/// ```
35#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize)]
36pub struct PubMedId {
37    value: u32,
38}
39
40impl PubMedId {
41    /// Parse a PMID from a string
42    ///
43    /// The input is automatically trimmed of whitespace.
44    ///
45    /// # Errors
46    ///
47    /// Returns `ParseError::InvalidPmid` if:
48    /// - The string is empty after trimming
49    /// - The string contains non-numeric characters
50    /// - The number is zero
51    /// - The number is too large to fit in a u32
52    ///
53    /// # Examples
54    ///
55    /// ```
56    /// use pubmed_parser::common::PubMedId;
57    ///
58    /// let pmid = PubMedId::parse("31978945").unwrap();
59    /// assert_eq!(pmid.as_u32(), 31978945);
60    ///
61    /// // With whitespace
62    /// let pmid = PubMedId::parse("  31978945  ").unwrap();
63    /// assert_eq!(pmid.as_u32(), 31978945);
64    ///
65    /// // Invalid cases
66    /// assert!(PubMedId::parse("").is_err());
67    /// assert!(PubMedId::parse("abc").is_err());
68    /// assert!(PubMedId::parse("0").is_err());
69    /// assert!(PubMedId::parse("-123").is_err());
70    /// ```
71    pub fn parse(s: &str) -> Result<Self> {
72        let trimmed = s.trim();
73
74        if trimmed.is_empty() {
75            return Err(ParseError::InvalidPmid {
76                pmid: s.to_string(),
77            });
78        }
79
80        // Parse as u32
81        let value = trimmed
82            .parse::<u32>()
83            .map_err(|_| ParseError::InvalidPmid {
84                pmid: s.to_string(),
85            })?;
86
87        // PMIDs should be positive (non-zero)
88        if value == 0 {
89            return Err(ParseError::InvalidPmid {
90                pmid: s.to_string(),
91            });
92        }
93
94        Ok(Self { value })
95    }
96
97    /// Create a PubMedId from a u32 value
98    ///
99    /// # Panics
100    ///
101    /// Panics if the value is zero.
102    ///
103    /// # Examples
104    ///
105    /// ```
106    /// use pubmed_parser::common::PubMedId;
107    ///
108    /// let pmid = PubMedId::from_u32(31978945);
109    /// assert_eq!(pmid.as_u32(), 31978945);
110    /// ```
111    pub fn from_u32(value: u32) -> Self {
112        assert!(value > 0, "PMID must be greater than zero");
113        Self { value }
114    }
115
116    /// Try to create a PubMedId from a u32 value
117    ///
118    /// # Errors
119    ///
120    /// Returns `ParseError::InvalidPmid` if the value is zero.
121    ///
122    /// # Examples
123    ///
124    /// ```
125    /// use pubmed_parser::common::PubMedId;
126    ///
127    /// let pmid = PubMedId::try_from_u32(31978945).unwrap();
128    /// assert_eq!(pmid.as_u32(), 31978945);
129    ///
130    /// assert!(PubMedId::try_from_u32(0).is_err());
131    /// ```
132    pub fn try_from_u32(value: u32) -> Result<Self> {
133        if value == 0 {
134            return Err(ParseError::InvalidPmid {
135                pmid: value.to_string(),
136            });
137        }
138        Ok(Self { value })
139    }
140
141    /// Get the PMID as a u32
142    ///
143    /// # Examples
144    ///
145    /// ```
146    /// use pubmed_parser::common::PubMedId;
147    ///
148    /// let pmid = PubMedId::parse("31978945").unwrap();
149    /// assert_eq!(pmid.as_u32(), 31978945);
150    /// ```
151    pub fn as_u32(&self) -> u32 {
152        self.value
153    }
154
155    /// Get the PMID as a string slice
156    ///
157    /// Note: This creates a temporary String and returns it.
158    /// For owned String, use `to_string()`.
159    ///
160    /// # Examples
161    ///
162    /// ```
163    /// use pubmed_parser::common::PubMedId;
164    ///
165    /// let pmid = PubMedId::from_u32(31978945);
166    /// assert_eq!(pmid.as_str(), "31978945");
167    /// ```
168    pub fn as_str(&self) -> String {
169        self.value.to_string()
170    }
171}
172
173impl fmt::Display for PubMedId {
174    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
175        write!(f, "{}", self.value)
176    }
177}
178
179impl FromStr for PubMedId {
180    type Err = ParseError;
181
182    fn from_str(s: &str) -> Result<Self> {
183        Self::parse(s)
184    }
185}
186
187impl From<u32> for PubMedId {
188    fn from(value: u32) -> Self {
189        Self::from_u32(value)
190    }
191}
192
193impl From<PubMedId> for u32 {
194    fn from(pmid: PubMedId) -> Self {
195        pmid.value
196    }
197}
198
199/// A validated PubMed Central ID (PMC ID)
200///
201/// PMC IDs are identifiers for full-text articles in the PMC database.
202/// They consist of the prefix "PMC" followed by numeric digits.
203/// This type ensures that the ID is valid and provides methods for
204/// parsing, cleaning, and normalizing the ID format.
205///
206/// # Examples
207///
208/// ```
209/// use pubmed_parser::common::PmcId;
210///
211/// // Parse with PMC prefix
212/// let pmcid = PmcId::parse("PMC7906746").unwrap();
213/// assert_eq!(pmcid.as_str(), "PMC7906746");
214/// assert_eq!(pmcid.numeric_part(), 7906746);
215///
216/// // Parse without PMC prefix (automatically added)
217/// let pmcid = PmcId::parse("7906746").unwrap();
218/// assert_eq!(pmcid.as_str(), "PMC7906746");
219///
220/// // With whitespace (automatically cleaned)
221/// let pmcid = PmcId::parse("  PMC7906746  ").unwrap();
222/// assert_eq!(pmcid.as_str(), "PMC7906746");
223/// ```
224#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize)]
225pub struct PmcId {
226    value: u32,
227}
228
229impl PmcId {
230    /// Parse a PMC ID from a string
231    ///
232    /// The input is automatically trimmed of whitespace and the "PMC" prefix
233    /// is added if not present. Case-insensitive parsing is supported.
234    ///
235    /// # Errors
236    ///
237    /// Returns `ParseError::InvalidPmcid` if:
238    /// - The string is empty after trimming
239    /// - The numeric part contains non-numeric characters
240    /// - The numeric part is zero
241    /// - The number is too large to fit in a u32
242    ///
243    /// # Examples
244    ///
245    /// ```
246    /// use pubmed_parser::common::PmcId;
247    ///
248    /// // With PMC prefix
249    /// let pmcid = PmcId::parse("PMC7906746").unwrap();
250    /// assert_eq!(pmcid.as_str(), "PMC7906746");
251    ///
252    /// // Without PMC prefix
253    /// let pmcid = PmcId::parse("7906746").unwrap();
254    /// assert_eq!(pmcid.as_str(), "PMC7906746");
255    ///
256    /// // Case insensitive
257    /// let pmcid = PmcId::parse("pmc7906746").unwrap();
258    /// assert_eq!(pmcid.as_str(), "PMC7906746");
259    ///
260    /// // With whitespace
261    /// let pmcid = PmcId::parse("  PMC7906746  ").unwrap();
262    /// assert_eq!(pmcid.as_str(), "PMC7906746");
263    ///
264    /// // Invalid cases
265    /// assert!(PmcId::parse("").is_err());
266    /// assert!(PmcId::parse("PMC").is_err());
267    /// assert!(PmcId::parse("PMC0").is_err());
268    /// assert!(PmcId::parse("PMCabc").is_err());
269    /// ```
270    pub fn parse(s: &str) -> Result<Self> {
271        let trimmed = s.trim();
272
273        if trimmed.is_empty() {
274            return Err(ParseError::InvalidPmcid {
275                pmcid: s.to_string(),
276            });
277        }
278
279        // Remove PMC prefix if present (case-insensitive)
280        let numeric_part = if trimmed.len() >= 3 && trimmed[0..3].eq_ignore_ascii_case("PMC") {
281            &trimmed[3..]
282        } else {
283            trimmed
284        };
285
286        // Check if numeric part is empty
287        if numeric_part.is_empty() {
288            return Err(ParseError::InvalidPmcid {
289                pmcid: s.to_string(),
290            });
291        }
292
293        // Parse numeric part as u32
294        let value = numeric_part
295            .parse::<u32>()
296            .map_err(|_| ParseError::InvalidPmcid {
297                pmcid: s.to_string(),
298            })?;
299
300        // PMC IDs should be positive (non-zero)
301        if value == 0 {
302            return Err(ParseError::InvalidPmcid {
303                pmcid: s.to_string(),
304            });
305        }
306
307        Ok(Self { value })
308    }
309
310    /// Create a PmcId from a u32 value
311    ///
312    /// # Panics
313    ///
314    /// Panics if the value is zero.
315    ///
316    /// # Examples
317    ///
318    /// ```
319    /// use pubmed_parser::common::PmcId;
320    ///
321    /// let pmcid = PmcId::from_u32(7906746);
322    /// assert_eq!(pmcid.as_str(), "PMC7906746");
323    /// assert_eq!(pmcid.numeric_part(), 7906746);
324    /// ```
325    pub fn from_u32(value: u32) -> Self {
326        assert!(value > 0, "PMC ID numeric part must be greater than zero");
327        Self { value }
328    }
329
330    /// Try to create a PmcId from a u32 value
331    ///
332    /// # Errors
333    ///
334    /// Returns `ParseError::InvalidPmcid` if the value is zero.
335    ///
336    /// # Examples
337    ///
338    /// ```
339    /// use pubmed_parser::common::PmcId;
340    ///
341    /// let pmcid = PmcId::try_from_u32(7906746).unwrap();
342    /// assert_eq!(pmcid.numeric_part(), 7906746);
343    ///
344    /// assert!(PmcId::try_from_u32(0).is_err());
345    /// ```
346    pub fn try_from_u32(value: u32) -> Result<Self> {
347        if value == 0 {
348            return Err(ParseError::InvalidPmcid {
349                pmcid: value.to_string(),
350            });
351        }
352        Ok(Self { value })
353    }
354
355    /// Get the full PMC ID as a string (with "PMC" prefix)
356    ///
357    /// # Examples
358    ///
359    /// ```
360    /// use pubmed_parser::common::PmcId;
361    ///
362    /// let pmcid = PmcId::from_u32(7906746);
363    /// assert_eq!(pmcid.as_str(), "PMC7906746");
364    /// ```
365    pub fn as_str(&self) -> String {
366        format!("PMC{}", self.value)
367    }
368
369    /// Get the numeric part of the PMC ID (without "PMC" prefix)
370    ///
371    /// # Examples
372    ///
373    /// ```
374    /// use pubmed_parser::common::PmcId;
375    ///
376    /// let pmcid = PmcId::parse("PMC7906746").unwrap();
377    /// assert_eq!(pmcid.numeric_part(), 7906746);
378    /// ```
379    pub fn numeric_part(&self) -> u32 {
380        self.value
381    }
382
383    /// Get the numeric part as a string (without "PMC" prefix)
384    ///
385    /// # Examples
386    ///
387    /// ```
388    /// use pubmed_parser::common::PmcId;
389    ///
390    /// let pmcid = PmcId::parse("PMC7906746").unwrap();
391    /// assert_eq!(pmcid.numeric_part_str(), "7906746");
392    /// ```
393    pub fn numeric_part_str(&self) -> String {
394        self.value.to_string()
395    }
396}
397
398impl fmt::Display for PmcId {
399    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
400        write!(f, "PMC{}", self.value)
401    }
402}
403
404impl FromStr for PmcId {
405    type Err = ParseError;
406
407    fn from_str(s: &str) -> Result<Self> {
408        Self::parse(s)
409    }
410}
411
412impl From<u32> for PmcId {
413    fn from(value: u32) -> Self {
414        Self::from_u32(value)
415    }
416}
417
#[cfg(test)]
mod tests {
    use super::*;

    // PubMedId Tests

    /// A plain digit string parses and is readable through both accessors.
    #[test]
    fn test_pubmedid_parse_valid() {
        let id = PubMedId::parse("31978945").unwrap();
        assert_eq!(id.as_u32(), 31978945);
        assert_eq!(id.as_str(), "31978945");
    }

    /// Surrounding whitespace does not affect the parsed value.
    #[test]
    fn test_pubmedid_parse_with_whitespace() {
        let id = PubMedId::parse("  31978945  ").unwrap();
        assert_eq!(id.as_u32(), 31978945);
    }

    /// Empty and whitespace-only inputs are rejected.
    #[test]
    fn test_pubmedid_parse_empty() {
        let blanks = ["", "   "];
        for &blank in blanks.iter() {
            assert!(PubMedId::parse(blank).is_err());
        }
    }

    /// Inputs containing letters or punctuation are rejected.
    #[test]
    fn test_pubmedid_parse_non_numeric() {
        let garbage = ["abc", "123abc", "12.34"];
        for &text in garbage.iter() {
            assert!(PubMedId::parse(text).is_err());
        }
    }

    /// Zero is not a valid PMID.
    #[test]
    fn test_pubmedid_parse_zero() {
        let outcome = PubMedId::parse("0");
        assert!(outcome.is_err());
    }

    /// A minus sign is not accepted.
    #[test]
    fn test_pubmedid_parse_negative() {
        let outcome = PubMedId::parse("-123");
        assert!(outcome.is_err());
    }

    /// The panicking numeric constructor keeps the given value.
    #[test]
    fn test_pubmedid_from_u32() {
        let id = PubMedId::from_u32(31978945);
        assert_eq!(id.as_u32(), 31978945);
    }

    /// The panicking numeric constructor refuses zero.
    #[test]
    #[should_panic(expected = "PMID must be greater than zero")]
    fn test_pubmedid_from_u32_zero_panics() {
        let _ = PubMedId::from_u32(0);
    }

    /// The fallible numeric constructor accepts non-zero and rejects zero.
    #[test]
    fn test_pubmedid_try_from_u32() {
        let id = PubMedId::try_from_u32(31978945).unwrap();
        assert_eq!(id.as_u32(), 31978945);

        let zero = PubMedId::try_from_u32(0);
        assert!(zero.is_err());
    }

    /// `Display` prints the bare number.
    #[test]
    fn test_pubmedid_display() {
        let id = PubMedId::from_u32(31978945);
        let shown = format!("{}", id);
        assert_eq!(shown, "31978945");
    }

    /// `str::parse` works through the `FromStr` impl.
    #[test]
    fn test_pubmedid_from_str_trait() {
        let id = "31978945".parse::<PubMedId>().unwrap();
        assert_eq!(id.as_u32(), 31978945);
    }

    /// A PMID converts back into its raw number.
    #[test]
    fn test_pubmedid_conversions() {
        let id = PubMedId::from_u32(31978945);
        let raw = u32::from(id);
        assert_eq!(raw, 31978945);
    }

    // PmcId Tests

    /// Input carrying the "PMC" prefix parses to the numeric part.
    #[test]
    fn test_pmcid_parse_with_prefix() {
        let id = PmcId::parse("PMC7906746").unwrap();
        assert_eq!(id.as_str(), "PMC7906746");
        assert_eq!(id.numeric_part(), 7906746);
    }

    /// Bare digits parse too; the prefix is added on output.
    #[test]
    fn test_pmcid_parse_without_prefix() {
        let id = PmcId::parse("7906746").unwrap();
        assert_eq!(id.as_str(), "PMC7906746");
        assert_eq!(id.numeric_part(), 7906746);
    }

    /// The prefix is matched regardless of letter case.
    #[test]
    fn test_pmcid_parse_case_insensitive() {
        let lower = PmcId::parse("pmc7906746").unwrap();
        let mixed = PmcId::parse("Pmc7906746").unwrap();
        let upper = PmcId::parse("PMC7906746").unwrap();

        assert_eq!(lower, mixed);
        assert_eq!(mixed, upper);
        assert_eq!(lower.as_str(), "PMC7906746");
    }

    /// Surrounding whitespace is ignored with and without the prefix.
    #[test]
    fn test_pmcid_parse_with_whitespace() {
        let prefixed = PmcId::parse("  PMC7906746  ").unwrap();
        assert_eq!(prefixed.as_str(), "PMC7906746");

        let bare = PmcId::parse("  7906746  ").unwrap();
        assert_eq!(bare.as_str(), "PMC7906746");
    }

    /// Empty input, blank input and a lone prefix are rejected.
    #[test]
    fn test_pmcid_parse_empty() {
        let blanks = ["", "   ", "PMC"];
        for &blank in blanks.iter() {
            assert!(PmcId::parse(blank).is_err());
        }
    }

    /// Non-digit characters in the numeric part are rejected.
    #[test]
    fn test_pmcid_parse_non_numeric() {
        let garbage = ["PMCabc", "PMC123abc", "abc"];
        for &text in garbage.iter() {
            assert!(PmcId::parse(text).is_err());
        }
    }

    /// Zero is not a valid numeric part, with or without prefix.
    #[test]
    fn test_pmcid_parse_zero() {
        let zeros = ["PMC0", "0"];
        for &text in zeros.iter() {
            assert!(PmcId::parse(text).is_err());
        }
    }

    /// The panicking numeric constructor keeps the given value.
    #[test]
    fn test_pmcid_from_u32() {
        let id = PmcId::from_u32(7906746);
        assert_eq!(id.as_str(), "PMC7906746");
        assert_eq!(id.numeric_part(), 7906746);
    }

    /// The panicking numeric constructor refuses zero.
    #[test]
    #[should_panic(expected = "PMC ID numeric part must be greater than zero")]
    fn test_pmcid_from_u32_zero_panics() {
        let _ = PmcId::from_u32(0);
    }

    /// The fallible numeric constructor accepts non-zero and rejects zero.
    #[test]
    fn test_pmcid_try_from_u32() {
        let id = PmcId::try_from_u32(7906746).unwrap();
        assert_eq!(id.numeric_part(), 7906746);

        let zero = PmcId::try_from_u32(0);
        assert!(zero.is_err());
    }

    /// The numeric part is available as text without the prefix.
    #[test]
    fn test_pmcid_numeric_part_str() {
        let id = PmcId::parse("PMC7906746").unwrap();
        assert_eq!(id.numeric_part_str(), "7906746");
    }

    /// `Display` prints the prefixed form.
    #[test]
    fn test_pmcid_display() {
        let id = PmcId::from_u32(7906746);
        let shown = format!("{}", id);
        assert_eq!(shown, "PMC7906746");
    }

    /// `str::parse` works through the `FromStr` impl for both input forms.
    #[test]
    fn test_pmcid_from_str_trait() {
        let prefixed = "PMC7906746".parse::<PmcId>().unwrap();
        assert_eq!(prefixed.numeric_part(), 7906746);

        let bare = "7906746".parse::<PmcId>().unwrap();
        assert_eq!(bare.as_str(), "PMC7906746");
    }

    /// The same ID compares equal however it was constructed.
    #[test]
    fn test_pmcid_equality() {
        let prefixed = PmcId::parse("PMC7906746").unwrap();
        let bare = PmcId::parse("7906746").unwrap();
        let numeric = PmcId::from_u32(7906746);

        assert_eq!(prefixed, bare);
        assert_eq!(bare, numeric);
    }

    /// Equal IDs hash identically, so a set keeps a single entry.
    #[test]
    fn test_pmcid_hash() {
        use std::collections::HashSet;

        let spellings = ["PMC7906746", "7906746"];
        let unique: HashSet<PmcId> = spellings
            .iter()
            .map(|&text| PmcId::parse(text).unwrap())
            .collect();

        // Should only contain one item since they're equal
        assert_eq!(unique.len(), 1);
    }

    // Real-world examples from the codebase

    /// Sample PMIDs round-trip unchanged through parse and `as_str`.
    #[test]
    fn test_real_world_pmids() {
        let samples = [
            "31978945", // COVID-19 research
            "25760099", // CRISPR-Cas9
            "33515491", // Cancer treatment
            "12345678",
        ];

        for &sample in samples.iter() {
            let id = PubMedId::parse(sample).unwrap();
            assert_eq!(id.as_str(), sample);
        }
    }

    /// Sample PMC IDs normalize to the prefixed form.
    #[test]
    fn test_real_world_pmcids() {
        let samples = [
            ("PMC7906746", "PMC7906746"),
            ("PMC10618641", "PMC10618641"),
            ("PMC10000000", "PMC10000000"),
            ("7906746", "PMC7906746"),   // Without prefix
            ("10618641", "PMC10618641"), // Without prefix
        ];

        for &(input, expected) in samples.iter() {
            let id = PmcId::parse(input).unwrap();
            assert_eq!(id.as_str(), expected);
        }
    }

    /// Both ID types survive a JSON round trip unchanged.
    #[test]
    fn test_serialization() {
        let pmid = PubMedId::from_u32(31978945);
        let pmid_json = serde_json::to_string(&pmid).unwrap();
        let pmid_back: PubMedId = serde_json::from_str(&pmid_json).unwrap();
        assert_eq!(pmid, pmid_back);

        let pmcid = PmcId::from_u32(7906746);
        let pmcid_json = serde_json::to_string(&pmcid).unwrap();
        let pmcid_back: PmcId = serde_json::from_str(&pmcid_json).unwrap();
        assert_eq!(pmcid, pmcid_back);
    }
}