pubmed_client/pubmed/query/
validation.rs

1//! Query validation and optimization methods
2
3use super::SearchQuery;
4use crate::error::{PubMedError, Result};
5
6impl SearchQuery {
7    /// Validate the query structure and parameters
8    ///
9    /// # Returns
10    ///
11    /// Returns an error if the query is invalid, Ok(()) otherwise
12    ///
13    /// # Example
14    ///
15    /// ```
16    /// use pubmed_client::pubmed::SearchQuery;
17    ///
18    /// let query = SearchQuery::new().query("covid-19");
19    /// assert!(query.validate().is_ok());
20    /// ```
21    pub fn validate(&self) -> Result<()> {
22        // Check if query is completely empty
23        if self.terms.is_empty() && self.filters.is_empty() {
24            return Err(PubMedError::InvalidQuery(
25                "Query cannot be empty".to_string(),
26            ));
27        }
28
29        // Validate limit is reasonable
30        if let Some(limit) = self.limit {
31            if limit == 0 {
32                return Err(PubMedError::InvalidQuery(
33                    "Limit must be greater than 0".to_string(),
34                ));
35            }
36            if limit > 10000 {
37                return Err(PubMedError::InvalidQuery(
38                    "Limit should not exceed 10,000 for performance reasons".to_string(),
39                ));
40            }
41        }
42
43        // Check for potentially problematic patterns
44        let query_string = self.build();
45        if query_string.len() > 4000 {
46            return Err(PubMedError::InvalidQuery(
47                "Query string is too long (>4000 characters)".to_string(),
48            ));
49        }
50
51        // Check for unbalanced parentheses
52        let open_parens = query_string.matches('(').count();
53        let close_parens = query_string.matches(')').count();
54        if open_parens != close_parens {
55            return Err(PubMedError::InvalidQuery(
56                "Unbalanced parentheses in query".to_string(),
57            ));
58        }
59
60        Ok(())
61    }
62
63    /// Optimize the query for better performance
64    ///
65    /// # Returns
66    ///
67    /// Returns an optimized version of the query
68    ///
69    /// # Example
70    ///
71    /// ```
72    /// use pubmed_client::pubmed::SearchQuery;
73    ///
74    /// let optimized = SearchQuery::new()
75    ///     .query("covid-19")
76    ///     .published_after(2020)
77    ///     .optimize();
78    /// ```
79    pub fn optimize(mut self) -> Self {
80        // Remove duplicate filters
81        self.filters.sort();
82        self.filters.dedup();
83
84        // Remove duplicate terms
85        self.terms.sort();
86        self.terms.dedup();
87
88        // Remove empty terms and filters
89        self.terms.retain(|term| !term.trim().is_empty());
90        self.filters.retain(|filter| !filter.trim().is_empty());
91
92        self
93    }
94
95    /// Get query statistics and information
96    ///
97    /// # Returns
98    ///
99    /// Returns a tuple of (term_count, filter_count, estimated_complexity)
100    ///
101    /// # Example
102    ///
103    /// ```
104    /// use pubmed_client::pubmed::SearchQuery;
105    ///
106    /// let query = SearchQuery::new()
107    ///     .query("machine learning")
108    ///     .published_after(2020)
109    ///     .free_full_text_only();
110    ///
111    /// let (terms, filters, complexity) = query.get_stats();
112    /// ```
113    pub fn get_stats(&self) -> (usize, usize, usize) {
114        let term_count = self.terms.len();
115        let filter_count = self.filters.len();
116
117        // Estimate complexity based on query structure
118        let query_string = self.build();
119        let complexity = query_string.matches(" AND ").count()
120            + query_string.matches(" OR ").count() * 2
121            + query_string.matches(" NOT ").count() * 2
122            + query_string.matches('(').count()
123            + 1; // Base complexity
124
125        (term_count, filter_count, complexity)
126    }
127}
128
#[cfg(test)]
mod tests {
    use super::*;

    /// Asserts that `query` fails validation with a message containing `expected`.
    fn assert_invalid(query: &SearchQuery, expected: &str) {
        let error = query
            .validate()
            .expect_err("validation was expected to fail");
        assert!(error.to_string().contains(expected));
    }

    /// Converts string literals into the owned strings stored by `SearchQuery`.
    fn strings(items: &[&str]) -> Vec<String> {
        items.iter().map(|item| item.to_string()).collect()
    }

    /// Shorthand for a query made of a single free-text term.
    fn term(text: &str) -> SearchQuery {
        SearchQuery::new().query(text)
    }

    // ---- validate ----------------------------------------------------------

    #[test]
    fn test_validate_empty_query() {
        assert_invalid(&SearchQuery::new(), "Query cannot be empty");
    }

    #[test]
    fn test_validate_valid_query() {
        assert!(term("covid-19").validate().is_ok());
    }

    #[test]
    fn test_validate_zero_limit() {
        assert_invalid(&term("test").limit(0), "Limit must be greater than 0");
    }

    #[test]
    fn test_validate_excessive_limit() {
        assert_invalid(&term("test").limit(20000), "Limit should not exceed 10,000");
    }

    #[test]
    fn test_validate_reasonable_limit() {
        assert!(term("test").limit(100).validate().is_ok());
    }

    #[test]
    fn test_validate_max_reasonable_limit() {
        assert!(term("test").limit(10000).validate().is_ok());
    }

    #[test]
    fn test_validate_very_long_query() {
        // One character past the 4000 limit enforced by `validate`.
        let oversized = SearchQuery::new().query("a".repeat(4001));
        assert_invalid(&oversized, "Query string is too long");
    }

    #[test]
    fn test_validate_unbalanced_parentheses() {
        // Start from a regular combined query, then overwrite its internal
        // state with a term that opens two groups and never closes them.
        let mut broken_query = term("covid").and(term("vaccine"));
        broken_query.terms = strings(&["((test"]);

        assert_invalid(&broken_query, "Unbalanced parentheses");
    }

    #[test]
    fn test_validate_balanced_parentheses() {
        let grouped = term("covid").and(term("vaccine")).group();
        assert!(grouped.validate().is_ok());
    }

    #[test]
    fn test_validate_with_filters_only() {
        let filters_only = SearchQuery::new().mesh_term("Neoplasms");
        assert!(filters_only.validate().is_ok());
    }

    #[test]
    fn test_validate_with_terms_only() {
        assert!(term("covid").validate().is_ok());
    }

    // ---- optimize ----------------------------------------------------------

    #[test]
    fn test_optimize_removes_duplicates() {
        let mut query = SearchQuery::new();
        query.terms = strings(&["covid", "vaccine", "covid"]);
        query.filters = strings(&["test[filter]", "other[filter]", "test[filter]"]);

        let optimized = query.optimize();

        assert_eq!(optimized.terms.len(), 2);
        assert_eq!(optimized.filters.len(), 2);
        for expected in ["covid", "vaccine"] {
            assert!(optimized.terms.iter().any(|kept| kept == expected));
        }
    }

    #[test]
    fn test_optimize_removes_empty_strings() {
        let mut query = SearchQuery::new();
        query.terms = strings(&["covid", "  ", "vaccine", ""]);
        query.filters = strings(&["test[filter]", "   ", "other[filter]"]);

        let optimized = query.optimize();

        assert_eq!(optimized.terms.len(), 2);
        assert_eq!(optimized.filters.len(), 2);
        for expected in ["covid", "vaccine"] {
            assert!(optimized.terms.iter().any(|kept| kept == expected));
        }
    }

    #[test]
    fn test_optimize_sorts_terms_and_filters() {
        let mut query = SearchQuery::new();
        query.terms = strings(&["zebra", "apple", "banana"]);
        query.filters = strings(&["z[filter]", "a[filter]", "b[filter]"]);

        let optimized = query.optimize();

        assert_eq!(optimized.terms, strings(&["apple", "banana", "zebra"]));
        assert_eq!(
            optimized.filters,
            strings(&["a[filter]", "b[filter]", "z[filter]"])
        );
    }

    #[test]
    fn test_optimize_preserves_limit() {
        let optimized = term("test").limit(100).optimize();
        assert_eq!(optimized.get_limit(), 100);
    }

    // ---- get_stats ---------------------------------------------------------

    #[test]
    fn test_get_stats_basic() {
        let query = term("covid")
            .query("vaccine")
            .mesh_term("Neoplasms")
            .author("Smith");

        let (term_count, filter_count, complexity) = query.get_stats();

        assert_eq!(term_count, 2); // covid, vaccine
        assert_eq!(filter_count, 2); // mesh term, author
        assert!(complexity > 0);
    }

    #[test]
    fn test_get_stats_empty_query() {
        let (term_count, filter_count, complexity) = SearchQuery::new().get_stats();

        assert_eq!(term_count, 0);
        assert_eq!(filter_count, 0);
        assert_eq!(complexity, 1); // Base complexity
    }

    #[test]
    fn test_get_stats_complex_query() {
        let complex_query = term("covid").and(term("vaccine")).or(term("treatment"));

        let (_, _, complexity) = complex_query.get_stats();

        // Boolean operators should push the estimate above the base value.
        assert!(complexity > 3);
    }

    #[test]
    fn test_complexity_calculation() {
        // Same operands, combined once with AND and once with OR.
        let (_, _, and_complexity) = term("a").and(term("b")).get_stats();
        let (_, _, or_complexity) = term("a").or(term("b")).get_stats();

        // OR is weighted at least as heavily as AND.
        assert!(or_complexity >= and_complexity);
    }

    #[test]
    fn test_stats_with_nested_queries() {
        let nested = term("a").and(term("b")).group();

        let (_, _, complexity) = nested.get_stats();

        // Both the grouping and the AND contribute to the estimate.
        assert!(complexity > 2);
    }
}
373}