| Home | Trees | Indices | Help |
|
|---|
|
|
1 """Base classes for match providers.
2
3 They are used by business objects to give
4 phrasewheels the ability to guess phrases.
5
6 Copyright (C) GNUMed developers
7 license: GPL v2 or later
8 """
9 __version__ = "$Revision: 1.34 $"
10 __author__ = "K.Hilbert <Karsten.Hilbert@gmx.net>, I.Haywood <ihaywood@gnu.org>, S.J.Tan <sjtan@bigpond.com>"
11
12 # std lib
13 import re as regex, logging
14
15
16 # GNUmed
17 from Gnumed.pycommon import gmPG2
18
19
20 _log = logging.getLogger('gm.ui')
21 _log.info(__version__)
22
23
# Regular expression matching runs of characters which are stripped
# from the fragment passed to the match provider before looking for
# matches.
# NOTE: the double quote must sit *inside* the character class. If it
# is appended after the closing "]+" the pattern only matches runs of
# ignored characters that are directly followed by a literal '"'.
default_ignored_chars = "[?!.'\\(){}\\[\\]<>~#*$%^_" + '"' + "]+"

# Regular expression used to detect word boundaries which is,
# in turn, used to normalize word boundaries in the
# input fragment.
default_word_separators = '[- \t=+&:@]+'
32 #============================================================
34 """Base class for match providing objects.
35
36 Match sources might be:
37 - database tables
38 - flat files
39 - previous input
40 - config files
41 - in-memory list created on the fly
42 """
43 print_queries = False
44 #--------------------------------------------------------
46 self.setThresholds()
47
48 self._context_vals = {}
49 self.__ignored_chars = regex.compile(default_ignored_chars)
50 # used to normalize word boundaries:
51 self.__word_separators = regex.compile(default_word_separators)
52 #--------------------------------------------------------
53 # actions
54 #--------------------------------------------------------
def getMatches(self, aFragment=None):
    """Return matches according to aFragment and matching thresholds.

    The fragment is lower-cased, stripped of ignored characters (if
    configured) and has its word separators normalized to single
    blanks (if configured). The length of the normalized fragment
    then selects the kind of search: substring, word-start or
    phrase-start.

    aFragment -- the text to find matches for; u'*' requests all items

    Returns whatever the selected getMatchesBy*() / getAllMatches()
    method returns, or (False, []) if the normalized fragment is
    shorter than the phrase threshold.

    Raises ValueError if aFragment is None.

    FIXME: design decision: we don't worry about data source changes
           during the lifetime of a MatchProvider
    FIXME: append _("*get all items*") on truncation
    """
    # sanity check
    if aFragment is None:
        raise ValueError('Cannot find matches without a fragment.')

    # user explicitly wants all matches
    if aFragment == u'*':
        return self.getAllMatches()

    # case insensitivity
    tmpFragment = aFragment.lower()
    # remove ignored chars
    if self.__ignored_chars is not None:
        tmpFragment = self.__ignored_chars.sub('', tmpFragment)
    # normalize word separators
    if self.__word_separators is not None:
        tmpFragment = u' '.join(self.__word_separators.split(tmpFragment))
    # length in number of significant characters only
    lngFragment = len(tmpFragment)

    # order is important: test the largest threshold first so that
    # the most permissive search which is allowed wins
    if lngFragment >= self.__threshold_substring:
        return self.getMatchesBySubstr(tmpFragment)
    if lngFragment >= self.__threshold_word:
        return self.getMatchesByWord(tmpFragment)
    if lngFragment >= self.__threshold_phrase:
        return self.getMatchesByPhrase(tmpFragment)
    return (False, [])
90 #--------------------------------------------------------
93 #--------------------------------------------------------
96 #--------------------------------------------------------
99 #--------------------------------------------------------
102 #--------------------------------------------------------
105 #--------------------------------------------------------
106 # configuration
107 #--------------------------------------------------------
109 """Set match location thresholds.
110
111 - the fragment passed to getMatches() must contain at least this many
112 characters before it triggers a match search at:
113 1) phrase_start - start of phrase (first word)
114 2) word_start - start of any word within phrase
115 3) in_word - _inside_ any word within phrase
116 """
117 # sanity checks
118 if aSubstring < aWord:
119 _log.error('Setting substring threshold (%s) lower than word-start threshold (%s) does not make sense. Retaining original thresholds (%s:%s, respectively).' % (aSubstring, aWord, self.__threshold_substring, self.__threshold_word))
120 return False
121 if aWord < aPhrase:
122 _log.error('Setting word-start threshold (%s) lower than phrase-start threshold (%s) does not make sense. Retaining original thresholds (%s:%s, respectively).' % (aSubstring, aWord, self.__threshold_word, self.__threshold_phrase))
123 return False
124
125 # now actually reassign thresholds
126 self.__threshold_phrase = aPhrase
127 self.__threshold_word = aWord
128 self.__threshold_substring = aSubstring
129
130 return True
131 #--------------------------------------------------------
133 if word_separators is None:
134 self.__word_separators = None
135 else:
136 self.__word_separators = regex.compile(word_separators)
137
142
143 word_separators = property(_get_word_separators, _set_word_separators)
144 #--------------------------------------------------------
146 if ignored_chars is None:
147 self.__ignored_chars = None
148 else:
149 self.__ignored_chars = regex.compile(ignored_chars)
150
155
156 ignored_chars = property(_get_ignored_chars, _set_ignored_chars)
157 #--------------------------------------------------------
159 """Set value to provide context information for matches.
160
161 The matching code may ignore it depending on its exact
162 implementation. Names and values of the context depend
163 on what is being matched.
164
165 <context> -- the *placeholder* key *inside* the context
166 definition, not the context *definition* key
167 """
168 if context is None:
169 return False
170 self._context_vals[context] = val
171 return True
172 #--------------------------------------------------------
178 #------------------------------------------------------------
179 # usable instances
180 #------------------------------------------------------------
182 """Match provider where all possible options can be held
183 in a reasonably sized, pre-allocated list.
184 """
186 """aSeq must be a list of dicts. Each dict must have the keys (data, list_label, weight)
187 """
188 if not type(aSeq) in [type(None), type([]), type(())]:
189 _log.error('fixed list match provider argument must be a list/tuple of dicts/None')
190 raise TypeError('fixed list match provider argument must be a list/tuple of dicts/None')
191
192 self.__items = aSeq
193 cMatchProvider.__init__(self)
194 #--------------------------------------------------------
195 # internal matching algorithms
196 #
197 # if we end up here:
198 # - aFragment will not be "None"
199 # - aFragment will be lower case
200 # - we _do_ deliver matches (whether we find any is a different story)
201 #--------------------------------------------------------
def getMatchesByPhrase(self, aFragment):
    """Return matches for aFragment at start of phrases.

    aFragment -- the fragment to look for; it is compared
                 case-insensitively against each item's 'list_label'

    Returns (True, <matching items ordered by self.__cmp_items>), or
    (False, []) if nothing matches or there are no items.
    """
    # the item list may be None or empty: nothing to search
    if not self.__items:
        return (False, [])

    # lower-case the fragment once rather than once per item
    fragment = aFragment.lower()
    matches = [
        item for item in self.__items
        if item['list_label'].lower().startswith(fragment)
    ]
    # no matches found
    if not matches:
        return (False, [])

    matches.sort(self.__cmp_items)
    return (True, matches)
216 #--------------------------------------------------------
def getMatchesByWord(self, aFragment):
    """Return matches for aFragment at start of words inside phrases.

    An item matches if its lower-cased 'list_label' starts with the
    fragment or contains the fragment directly after a blank.

    aFragment -- the fragment to look for (compared case-insensitively)

    Returns (True, <matching items ordered by self.__cmp_items>), or
    (False, []) if nothing matches or there are no items.
    """
    # the item list may be None or empty: nothing to search
    if not self.__items:
        return (False, [])

    # lower-case the fragment once rather than once per item
    fragment = aFragment.lower()
    # looking for " <fragment>" anywhere (instead of inspecting only the
    # first occurrence of the fragment) makes sure an earlier hit in the
    # middle of a word cannot hide a later hit at the start of a word
    word_start = u' ' + fragment
    matches = []
    for item in self.__items:
        item_label = item['list_label'].lower()
        if item_label.startswith(fragment) or word_start in item_label:
            matches.append(item)
    # no matches found
    if not matches:
        return (False, [])

    matches.sort(self.__cmp_items)
    return (True, matches)
238 #--------------------------------------------------------
def getMatchesBySubstr(self, aFragment):
    """Return matches for aFragment as a true substring.

    aFragment -- the fragment to look for; it is compared
                 case-insensitively against each item's 'list_label'

    Returns (True, <matching items ordered by self.__cmp_items>), or
    (False, []) if nothing matches or there are no items.
    """
    # the item list may be None or empty: nothing to search
    if not self.__items:
        return (False, [])

    # lower-case the fragment once rather than once per item
    fragment = aFragment.lower()
    matches = [
        item for item in self.__items
        if fragment in item['list_label'].lower()
    ]
    # no matches found
    if not matches:
        return (False, [])

    matches.sort(self.__cmp_items)
    return (True, matches)
252 #--------------------------------------------------------
def getAllMatches(self):
    """Return all items.

    Returns (True, <list of all items ordered by self.__cmp_items>),
    or (False, []) if there are no items (None or empty sequence).
    """
    # the item sequence may be None or empty: nothing to deliver
    if not self.__items:
        return (False, [])

    # work on a list copy: the items may be held in a tuple (which
    # cannot be sorted in place), and callers should not be handed
    # the provider's internal sequence
    matches = list(self.__items)
    matches.sort(self.__cmp_items)
    return (True, matches)
262 #--------------------------------------------------------
264 """items must be a list of dicts. Each dict must have the keys (data, list_label, weight)"""
265 self.__items = items
266 #--------------------------------------------------------
277 # ===========================================================
279 """Match provider which searches matches
280 in the results of a function call.
281 """
283 """get_candidates() must return a list of dicts, each of which must have the key 'list_label'."""
284 if get_candidates is None:
285 _log.error('must define function to retrieve match candidates list')
286 raise ValueError('must define function to retrieve match candidates list')
287
288 self._get_candidates = get_candidates
289 cMatchProvider.__init__(self)
290 #--------------------------------------------------------
291 # internal matching algorithms
292 #
293 # if we end up here:
294 # - aFragment will not be "None"
295 # - aFragment will be lower case
296 # - we _do_ deliver matches (whether we find any is a different story)
297 #--------------------------------------------------------
def getMatchesByPhrase(self, aFragment):
    """Return matches for aFragment at start of phrases.

    aFragment -- the fragment to look for; the comments in this file
                 state it arrives lower-cased, and it is compared
                 against the lower-cased 'list_label' of each candidate

    Returns (True, <matching candidates ordered by
    self.__cmp_candidates>) or (False, []) if nothing matches.
    """
    # a candidate matches if its *label* starts with the fragment,
    # not the other way round
    matches = [
        candidate for candidate in self._get_candidates()
        if candidate['list_label'].lower().startswith(aFragment)
    ]
    # no matches found
    if not matches:
        return (False, [])

    matches.sort(self.__cmp_candidates)
    return (True, matches)
313 #--------------------------------------------------------
def getMatchesByWord(self, aFragment):
    """Return matches for aFragment at start of words inside phrases.

    A candidate matches if its lower-cased 'list_label' starts with
    the fragment or contains the fragment directly after a blank.

    aFragment -- the fragment to look for; the comments in this file
                 state it arrives lower-cased

    Returns (True, <matching candidates ordered by
    self.__cmp_candidates>) or (False, []) if nothing matches.
    """
    # FIXME: use word seps
    # Searching for " <fragment>" avoids indexing the label with the
    # result of find(): when the fragment does not occur at all find()
    # returns -1, and label[-2] would be inspected (or raise IndexError
    # on labels shorter than two characters).
    word_start = u' ' + aFragment
    matches = []
    for candidate in self._get_candidates():
        label = candidate['list_label'].lower()
        if label.startswith(aFragment) or word_start in label:
            matches.append(candidate)
    # no matches found
    if not matches:
        return (False, [])

    matches.sort(self.__cmp_candidates)
    return (True, matches)
333 #--------------------------------------------------------
335 """Return matches for aFragment as a true substring."""
336 matches = []
337 candidates = self._get_candidates()
338 # look for matches
339 for candidate in candidates:
340 if candidate['list_label'].lower().find(aFragment) != -1:
341 # if string.find(string.lower(candidate['list_label']), aFragment) != -1:
342 matches.append(candidate)
343 # no matches found
344 if len(matches) == 0:
345 return (False, [])
346
347 matches.sort(self.__cmp_candidates)
348 return (True, matches)
349 #--------------------------------------------------------
353 #--------------------------------------------------------
357 # FIXME: do ordering
358 # if candidate1 < candidate2:
359 # return -1
360 # if candidate1 == candidate2:
361 # return 0
362 # return 1
363
364 # ===========================================================
366 """Match provider which searches matches
367 in possibly several database tables.
368
369 queries:
370 - a list of unicode strings
371 - each string is a query
372 - each string must contain: "... where <column> %(fragment_condition)s ..."
373 - each string can contain in the where clause: "... %(<context_key>)s ..."
374 - each query must return (data, list_label) and may return field_label as a third column
375
376 context definitions to be used in the queries
377 example: {'ctxt_country': {'where_part': 'and country = %(country)s', 'placeholder': 'country'}}
378
379 _SQL_data2match:
380 SQL to retrieve a match by, say, primary key
381 wherein the only argument is 'pk'
382 """
384
385 cMatchProvider.__init__(self)
386
387 if type(queries) == type([]):
388 self._queries = queries
389 else:
390 self._queries = [queries]
391
392 if context is None:
393 self._context = {}
394 else:
395 self._context = context
396
397 self._args = {}
398
399 self._SQL_data2match = None
400 #--------------------------------------------------------
401 # internal matching algorithms
402 #
403 # if we end up here:
404 # - aFragment will not be "None"
405 # - aFragment will be lower case
406 # - we _do_ deliver matches (whether we find any is a different story)
407 #--------------------------------------------------------
409 """Return matches for aFragment at start of phrases."""
410
411 fragment_condition = u"ILIKE %(fragment)s"
412 self._args['fragment'] = u"%s%%" % aFragment
413
414 return self._find_matches(fragment_condition)
415 #--------------------------------------------------------
417 """Return matches for aFragment at start of words inside phrases."""
418
419 fragment_condition = u"~* %(fragment)s"
420 aFragment = gmPG2.sanitize_pg_regex(expression = aFragment, escape_all = False)
421 self._args['fragment'] = u"( %s)|(^%s)" % (aFragment, aFragment)
422
423 return self._find_matches(fragment_condition)
424 #--------------------------------------------------------
426 """Return matches for aFragment as a true substring."""
427
428 fragment_condition = u"ILIKE %(fragment)s"
429 self._args['fragment'] = u"%%%s%%" % aFragment
430
431 return self._find_matches(fragment_condition)
432 #--------------------------------------------------------
436 #--------------------------------------------------------
438 if self._SQL_data2match is None:
439 return None
440
441 query = {'cmd': self._SQL_data2match, 'args': {'pk': data}}
442 try:
443 rows, idx = gmPG2.run_ro_queries(queries = [query], get_col_idx = False)
444 except:
445 _log.exception('[%s]: error running _SQL_data2match, dropping query', self.__class__.__name__)
446 self._SQL_data2match = None
447 return None
448
449 # hopefully the most frequent case:
450 if len(rows) == 1:
451 return rows[0]
452
453 _log.error('[%s]: 0 or >1 rows found by running _SQL_data2match, ambiguous, ignoring', self.__class__.__name__)
454 return None
455 #--------------------------------------------------------
457 if self.print_queries:
458 print "----------------------"
459 matches = []
460 for query in self._queries:
461 where_fragments = {'fragment_condition': fragment_condition}
462
463 for context_key, context_def in self._context.items():
464 try:
465 placeholder = context_def['placeholder']
466 where_part = context_def['where_part']
467 self._args[placeholder] = self._context_vals[placeholder]
468 # we do have a context value for this key, so add the where condition
469 where_fragments[context_key] = where_part
470 if self.print_queries:
471 print "ctxt ph:", placeholder
472 print "ctxt where:", where_part
473 print "ctxt val:", self._context_vals[placeholder]
474 except KeyError:
475 # we don't have a context value for this key, so skip the where condition
476 where_fragments[context_key] = u''
477 if self.print_queries:
478 print "invalid ctxt key:", context_key
479
480 cmd = query % where_fragments
481
482 if self.print_queries:
483 print "class:", self.__class__.__name__
484 print "ctxt:", self._context_vals
485 print "args:", self._args
486 print "query:", cmd
487
488 try:
489 rows, idx = gmPG2.run_ro_queries(queries = [{'cmd': cmd, 'args': self._args}], get_col_idx = False)
490 except:
491 _log.exception('[%s]: error running match provider SQL, dropping query', self.__class__.__name__)
492 idx = self._queries.index(query)
493 del self._queries[idx]
494 break
495
496 # no matches found: try next query
497 if len(rows) == 0:
498 continue
499
500 for row in rows:
501 match = {'weight': 0}
502
503 try:
504 match['data'] = row['data']
505 except KeyError:
506 match['data'] = row[0]
507
508 try:
509 match['list_label'] = row['list_label']
510 except KeyError:
511 match['list_label'] = row[1]
512
513 # explicit "field_label" in result ?
514 try:
515 match['field_label'] = row['field_label']
516 # no
517 except KeyError:
518 # but does row[2] exist ?
519 try:
520 match['field_label'] = row[2]
521 # no: reuse "list_label"
522 except IndexError:
523 match['field_label'] = match['list_label']
524
525 # try:
526 # match['label'] = row['label']
527 # except KeyError:
528 # match['label'] = match['list_label']
529
530 matches.append(match)
531
532 return (True, matches)
533
534 # none found whatsoever
535 return (False, [])
536 #================================================================
537 if __name__ == '__main__':
538 pass
539
540 #================================================================
541
| Home | Trees | Indices | Help |
|
|---|
| Generated by Epydoc 3.0.1 on Mon Dec 5 04:00:20 2011 | http://epydoc.sourceforge.net |