pubmed_client/pubmed/client/
espell.rs

1//! ESpell API operations for spell-checking search terms
2
3use crate::error::{PubMedError, Result};
4use crate::pubmed::models::{SpellCheckResult, SpelledQuerySegment};
5use tracing::{debug, info, instrument};
6
7use super::PubMedClient;
8
9impl PubMedClient {
10    /// Check spelling of a search term using the ESpell API
11    ///
12    /// Provides spelling suggestions for terms within a single text query.
13    /// Useful as a preprocessing step before executing actual searches to improve
14    /// search accuracy.
15    ///
16    /// # Arguments
17    ///
18    /// * `term` - The search term to spell-check
19    ///
20    /// # Returns
21    ///
22    /// Returns a `Result<SpellCheckResult>` containing the original query,
23    /// corrected query, and detailed information about which terms were corrected.
24    ///
25    /// # Example
26    ///
27    /// ```no_run
28    /// use pubmed_client::PubMedClient;
29    ///
30    /// #[tokio::main]
31    /// async fn main() -> Result<(), Box<dyn std::error::Error>> {
32    ///     let client = PubMedClient::new();
33    ///     let result = client.spell_check("asthmaa OR alergies").await?;
34    ///
35    ///     println!("Original: {}", result.query);
36    ///     println!("Corrected: {}", result.corrected_query);
37    ///
38    ///     if result.has_corrections() {
39    ///         println!("Replacements: {:?}", result.replacements());
40    ///     }
41    ///
42    ///     Ok(())
43    /// }
44    /// ```
45    #[instrument(skip(self), fields(term = %term))]
46    pub async fn spell_check(&self, term: &str) -> Result<SpellCheckResult> {
47        self.spell_check_db(term, "pubmed").await
48    }
49
50    /// Check spelling of a search term against a specific database using the ESpell API
51    ///
52    /// Spelling suggestions are database-specific, so use the same database you plan to search.
53    ///
54    /// # Arguments
55    ///
56    /// * `term` - The search term to spell-check
57    /// * `db` - The NCBI database to check against (e.g., "pubmed", "pmc")
58    ///
59    /// # Returns
60    ///
61    /// Returns a `Result<SpellCheckResult>` containing spelling suggestions
62    ///
63    /// # Example
64    ///
65    /// ```no_run
66    /// use pubmed_client::PubMedClient;
67    ///
68    /// #[tokio::main]
69    /// async fn main() -> Result<(), Box<dyn std::error::Error>> {
70    ///     let client = PubMedClient::new();
71    ///     let result = client.spell_check_db("fiberblast cell grwth", "pmc").await?;
72    ///     println!("Corrected: {}", result.corrected_query);
73    ///     Ok(())
74    /// }
75    /// ```
76    #[instrument(skip(self), fields(term = %term, db = %db))]
77    pub async fn spell_check_db(&self, term: &str, db: &str) -> Result<SpellCheckResult> {
78        let term = term.trim();
79        if term.is_empty() {
80            return Err(PubMedError::InvalidQuery(
81                "Search term cannot be empty".to_string(),
82            ));
83        }
84
85        let db = db.trim();
86        if db.is_empty() {
87            return Err(PubMedError::ApiError {
88                status: 400,
89                message: "Database name cannot be empty".to_string(),
90            });
91        }
92
93        let url = format!(
94            "{}/espell.fcgi?db={}&term={}",
95            self.base_url,
96            urlencoding::encode(db),
97            urlencoding::encode(term)
98        );
99
100        debug!(term = %term, db = %db, "Making ESpell API request");
101        let response = self.make_request(&url).await?;
102        let xml_text = response.text().await?;
103
104        let result = Self::parse_espell_response(&xml_text, term, db)?;
105
106        info!(
107            term = %term,
108            corrected = %result.corrected_query,
109            has_corrections = result.has_corrections(),
110            "ESpell completed"
111        );
112
113        Ok(result)
114    }
115
116    /// Parse ESpell XML response into SpellCheckResult
117    pub(crate) fn parse_espell_response(
118        xml: &str,
119        query_term: &str,
120        db: &str,
121    ) -> Result<SpellCheckResult> {
122        use crate::common::xml_utils::extract_text_between;
123
124        // Check for error
125        let error = extract_text_between(xml, "<ERROR>", "</ERROR>");
126        if let Some(error_msg) = error
127            && !error_msg.is_empty()
128        {
129            return Err(PubMedError::ApiError {
130                status: 200,
131                message: format!("NCBI ESpell API error: {}", error_msg),
132            });
133        }
134
135        let database = extract_text_between(xml, "<Database>", "</Database>")
136            .unwrap_or_else(|| db.to_string());
137
138        let query = extract_text_between(xml, "<Query>", "</Query>")
139            .unwrap_or_else(|| query_term.to_string());
140
141        let corrected_query =
142            extract_text_between(xml, "<CorrectedQuery>", "</CorrectedQuery>").unwrap_or_default();
143
144        // Parse SpelledQuery segments
145        let spelled_query = if let Some(spelled_content) =
146            extract_text_between(xml, "<SpelledQuery>", "</SpelledQuery>")
147        {
148            Self::parse_spelled_query_segments(&spelled_content)
149        } else {
150            Vec::new()
151        };
152
153        Ok(SpellCheckResult {
154            database,
155            query,
156            corrected_query,
157            spelled_query,
158        })
159    }
160
161    /// Parse the interleaved <Original> and <Replaced> elements from SpelledQuery
162    fn parse_spelled_query_segments(content: &str) -> Vec<SpelledQuerySegment> {
163        let mut segments = Vec::new();
164        let mut pos = 0;
165
166        while pos < content.len() {
167            let orig_pos = content[pos..].find("<Original>");
168            let repl_pos = content[pos..].find("<Replaced>");
169
170            match (orig_pos, repl_pos) {
171                (Some(o), Some(r)) if o <= r => {
172                    // <Original> comes first
173                    let abs_start = pos + o;
174                    if let Some(end_offset) = content[abs_start..].find("</Original>") {
175                        let text_start = abs_start + "<Original>".len();
176                        let text_end = abs_start + end_offset;
177                        segments.push(SpelledQuerySegment::Original(
178                            content[text_start..text_end].to_string(),
179                        ));
180                        pos = text_end + "</Original>".len();
181                    } else {
182                        break;
183                    }
184                }
185                (Some(_), Some(r)) => {
186                    // <Replaced> comes first
187                    let abs_start = pos + r;
188                    if let Some(end_offset) = content[abs_start..].find("</Replaced>") {
189                        let text_start = abs_start + "<Replaced>".len();
190                        let text_end = abs_start + end_offset;
191                        segments.push(SpelledQuerySegment::Replaced(
192                            content[text_start..text_end].to_string(),
193                        ));
194                        pos = text_end + "</Replaced>".len();
195                    } else {
196                        break;
197                    }
198                }
199                (Some(o), None) => {
200                    // Only <Original> remaining
201                    let abs_start = pos + o;
202                    if let Some(end_offset) = content[abs_start..].find("</Original>") {
203                        let text_start = abs_start + "<Original>".len();
204                        let text_end = abs_start + end_offset;
205                        segments.push(SpelledQuerySegment::Original(
206                            content[text_start..text_end].to_string(),
207                        ));
208                        pos = text_end + "</Original>".len();
209                    } else {
210                        break;
211                    }
212                }
213                (None, Some(r)) => {
214                    // Only <Replaced> remaining
215                    let abs_start = pos + r;
216                    if let Some(end_offset) = content[abs_start..].find("</Replaced>") {
217                        let text_start = abs_start + "<Replaced>".len();
218                        let text_end = abs_start + end_offset;
219                        segments.push(SpelledQuerySegment::Replaced(
220                            content[text_start..text_end].to_string(),
221                        ));
222                        pos = text_end + "</Replaced>".len();
223                    } else {
224                        break;
225                    }
226                }
227                (None, None) => break,
228            }
229        }
230
231        segments
232    }
233}
234
#[cfg(test)]
mod tests {
    use super::*;

    // A response with two corrected words: every field is populated and the
    // segments alternate between untouched text and replacements.
    #[test]
    fn test_parse_espell_response_with_corrections() {
        let xml = r#"<?xml version="1.0" encoding="UTF-8"?>
<eSpellResult>
  <Database>pubmed</Database>
  <Query>asthmaa OR alergies</Query>
  <CorrectedQuery>asthma or allergies</CorrectedQuery>
  <SpelledQuery>
    <Original></Original>
    <Replaced>asthma</Replaced>
    <Original> OR </Original>
    <Replaced>allergies</Replaced>
  </SpelledQuery>
  <ERROR/>
</eSpellResult>"#;

        let result =
            PubMedClient::parse_espell_response(xml, "asthmaa OR alergies", "pubmed").unwrap();
        assert_eq!(result.database, "pubmed");
        assert_eq!(result.query, "asthmaa OR alergies");
        assert_eq!(result.corrected_query, "asthma or allergies");
        assert!(result.has_corrections());

        let replacements = result.replacements();
        assert_eq!(replacements.len(), 2);
        assert_eq!(replacements[0], "asthma");
        assert_eq!(replacements[1], "allergies");

        assert_eq!(
            result.spelled_query,
            vec![
                SpelledQuerySegment::Original("".to_string()),
                SpelledQuerySegment::Replaced("asthma".to_string()),
                SpelledQuerySegment::Original(" OR ".to_string()),
                SpelledQuerySegment::Replaced("allergies".to_string()),
            ]
        );
    }

    // A correctly spelled term comes back as a single Original segment.
    #[test]
    fn test_parse_espell_response_no_corrections() {
        let xml = r#"<?xml version="1.0" encoding="UTF-8"?>
<eSpellResult>
  <Database>pubmed</Database>
  <Query>asthma</Query>
  <CorrectedQuery>asthma</CorrectedQuery>
  <SpelledQuery>
    <Original>asthma</Original>
  </SpelledQuery>
  <ERROR/>
</eSpellResult>"#;

        let result = PubMedClient::parse_espell_response(xml, "asthma", "pubmed").unwrap();
        assert_eq!(result.query, "asthma");
        assert_eq!(result.corrected_query, "asthma");
        assert!(!result.has_corrections());
        assert!(result.replacements().is_empty());
        assert_eq!(
            result.spelled_query,
            vec![SpelledQuerySegment::Original("asthma".to_string())]
        );
    }

    // No suggestion available: the corrected query is empty and the
    // self-closing <SpelledQuery/> produces no segments.
    #[test]
    fn test_parse_espell_response_empty_corrected() {
        let xml = r#"<eSpellResult>
  <Database>pubmed</Database>
  <Query>xyznonexistent</Query>
  <CorrectedQuery></CorrectedQuery>
  <SpelledQuery/>
  <ERROR/>
</eSpellResult>"#;

        let result = PubMedClient::parse_espell_response(xml, "xyznonexistent", "pubmed").unwrap();
        assert_eq!(result.query, "xyznonexistent");
        assert_eq!(result.corrected_query, "");
        assert!(result.spelled_query.is_empty());
    }

    // The database reported in the response is carried through to the result.
    #[test]
    fn test_parse_espell_response_pmc_database() {
        let xml = r#"<eSpellResult>
  <Database>pmc</Database>
  <Query>fiberblast</Query>
  <CorrectedQuery>fibroblast</CorrectedQuery>
  <SpelledQuery>
    <Replaced>fibroblast</Replaced>
  </SpelledQuery>
  <ERROR/>
</eSpellResult>"#;

        let result = PubMedClient::parse_espell_response(xml, "fiberblast", "pmc").unwrap();
        assert_eq!(result.database, "pmc");
        assert_eq!(result.corrected_query, "fibroblast");
        assert!(result.has_corrections());
        assert_eq!(
            result.spelled_query,
            vec![SpelledQuerySegment::Replaced("fibroblast".to_string())]
        );
    }

    // The validation tests below never reach the network: the input is
    // rejected before a request is built, so the exact error variant is pinned.

    #[test]
    fn test_spell_check_empty_term() {
        let client = PubMedClient::new();
        let result = tokio_test::block_on(client.spell_check(""));
        assert!(matches!(result, Err(PubMedError::InvalidQuery(_))));
    }

    #[test]
    fn test_spell_check_whitespace_term() {
        let client = PubMedClient::new();
        let result = tokio_test::block_on(client.spell_check("   "));
        assert!(matches!(result, Err(PubMedError::InvalidQuery(_))));
    }

    #[test]
    fn test_spell_check_db_empty_db() {
        let client = PubMedClient::new();
        let result = tokio_test::block_on(client.spell_check_db("asthma", ""));
        assert!(matches!(
            result,
            Err(PubMedError::ApiError { status: 400, .. })
        ));
    }
}