| Home | Trees | Indices | Help | 
 | 
|---|
|  | 
  1  """Base classes for match providers. 
  2   
  3  They are used by business objects to give 
  4  phrasewheels the ability to guess phrases. 
  5   
  6  Copyright (C) GNUMed developers 
  7  license: GPL v2 or later 
  8  """ 
  9  __version__ = "$Revision: 1.34 $" 
 10  __author__  = "K.Hilbert <Karsten.Hilbert@gmx.net>, I.Haywood <ihaywood@gnu.org>, S.J.Tan <sjtan@bigpond.com>" 
 11   
 12  # std lib 
 13  import re as regex, logging 
 14   
 15   
 16  # GNUmed 
 17  from Gnumed.pycommon import gmPG2 
 18   
 19   
 20  _log = logging.getLogger('gm.ui') 
 21  _log.info(__version__) 
 22   
 23   
 24  # these are stripped from the fragment passed to the 
 25  # match provider before looking for matches: 
 26  default_ignored_chars = "[?!.'\\(){}\[\]<>~#*$%^_]+" + '"' 
 27   
 28  # these are used to detect word boundaries which is, 
 29  # in turn, used to normalize word boundaries in the 
 30  # input fragment 
 31  default_word_separators = '[- \t=+&:@]+' 
 32  #============================================================ 
 34          """Base class for match providing objects. 
 35   
 36          Match sources might be: 
 37          - database tables 
 38          - flat files 
 39          - previous input 
 40          - config files 
 41          - in-memory list created on the fly 
 42          """ 
 43          print_queries = False 
 44          #-------------------------------------------------------- 
 46                  self.setThresholds() 
 47   
 48                  self._context_vals = {} 
 49                  self.__ignored_chars = regex.compile(default_ignored_chars) 
 50                  # used to normalize word boundaries: 
 51                  self.__word_separators = regex.compile(default_word_separators) 
 52          #-------------------------------------------------------- 
 53          # actions 
 54          #-------------------------------------------------------- 
 56                  """Return matches according to aFragment and matching thresholds. 
 57   
 58                  Design decision: we don't worry about data source changes 
 59                  during the lifetime of a MatchProvider. 
 60                  FIXME: append _("*get all items*") on truncation 
 61                  """ 
 62                  # a fragment is mandatory 
 63                  if aFragment is None: 
 64                          raise ValueError, 'Cannot find matches without a fragment.' 
 65   
 66                  # user explicitly wants all matches 
 67                  if aFragment == u'*': 
 68                          return self.getAllMatches() 
 69   
 70                  # match case-insensitively: work on a lower-cased copy 
 71                  tmpFragment = aFragment.lower() 
 72                  # remove ignored chars 
 73                  if self.__ignored_chars is not None: 
 74                          tmpFragment = self.__ignored_chars.sub('', tmpFragment) 
 75                  # collapse each run of word separators into a single space 
 76                  if self.__word_separators is not None: 
 77                          tmpFragment = u' '.join(self.__word_separators.split(tmpFragment)) 
 78                  # length in number of significant characters only 
 79                  lngFragment = len(tmpFragment) 
 80   
 81                  # order is important: test the highest threshold (substring) first 
 82                  if lngFragment >= self.__threshold_substring: 
 83                          return self.getMatchesBySubstr(tmpFragment) 
 84                  elif lngFragment >= self.__threshold_word: 
 85                          return self.getMatchesByWord(tmpFragment) 
 86                  elif lngFragment >= self.__threshold_phrase: 
 87                          return self.getMatchesByPhrase(tmpFragment) 
 88                  else: 
 89                          return (False, []) 
 90          #-------------------------------------------------------- 
 93          #-------------------------------------------------------- 
 96          #-------------------------------------------------------- 
 99          #-------------------------------------------------------- 
102          #-------------------------------------------------------- 
105          #-------------------------------------------------------- 
106          # configuration 
107          #-------------------------------------------------------- 
109                  """Set match location thresholds. 
110   
111                  - the fragment passed to getMatches() must contain at least this many 
112                    characters before it triggers a match search at: 
113                    1) phrase_start - start of phrase (first word) 
114                    2) word_start - start of any word within phrase 
115                    3) in_word - _inside_ any word within phrase 
116                  """ 
117                  # sanity checks 
118                  if aSubstring < aWord: 
119                          _log.error('Setting substring threshold (%s) lower than word-start threshold (%s) does not make sense. Retaining original thresholds (%s:%s, respectively).' % (aSubstring, aWord, self.__threshold_substring, self.__threshold_word)) 
120                          return False 
121                  if aWord < aPhrase: 
122                          _log.error('Setting word-start threshold (%s) lower than phrase-start threshold (%s) does not make sense. Retaining original thresholds (%s:%s, respectively).' % (aSubstring, aWord, self.__threshold_word, self.__threshold_phrase)) 
123                          return False 
124   
125                  # now actually reassign thresholds 
126                  self.__threshold_phrase = aPhrase 
127                  self.__threshold_word   = aWord 
128                  self.__threshold_substring      = aSubstring 
129   
130                  return True 
131          #-------------------------------------------------------- 
133                  if word_separators is None: 
134                          self.__word_separators = None 
135                  else: 
136                          self.__word_separators = regex.compile(word_separators) 
137   
142   
143          word_separators = property(_get_word_separators, _set_word_separators) 
144          #-------------------------------------------------------- 
146                  if ignored_chars is None: 
147                          self.__ignored_chars = None 
148                  else: 
149                          self.__ignored_chars = regex.compile(ignored_chars) 
150   
155   
156          ignored_chars = property(_get_ignored_chars, _set_ignored_chars) 
157          #-------------------------------------------------------- 
159                  """Set a value providing context information for matches. 
160   
161                  The matching code may ignore it depending on its exact 
162                  implementation. Names and values of the context depend 
163                  on what is being matched. 
164   
165                  <context> -- the *placeholder* key *inside* the context 
166                               definition, not the context *definition* key 
167                  """ 
168                  if context is None: 
169                          return False 
170                  self._context_vals[context] = val 
171                  return True 
172          #-------------------------------------------------------- 
178  #------------------------------------------------------------ 
179  # usable instances 
180  #------------------------------------------------------------ 
182          """Match provider where all possible options can be held 
183             in a reasonably sized, pre-allocated list. 
184          """ 
186                  """aSeq must be a list/tuple of dicts, or None. Each dict must have the keys (data, list_label, weight) 
187                  """ 
188                  if not type(aSeq) in [type(None), type([]), type(())]: 
189                          _log.error('fixed list match provider argument must be a list/tuple of dicts/None') 
190                          raise TypeError('fixed list match provider argument must be a list/tuple of dicts/None') 
191   
192                  self.__items = aSeq 
193                  cMatchProvider.__init__(self) 
194          #-------------------------------------------------------- 
195          # internal matching algorithms 
196          # 
197          # if we end up here: 
198          #       - aFragment will not be "None" 
199          #   - aFragment will be lower case 
200          #       - we _do_ deliver matches (whether we find any is a different story) 
201          #-------------------------------------------------------- 
203                  """Return matches for aFragment at start of phrases.""" 
204                  matches = [] 
205                  # look for matches 
206                  for item in self.__items: 
207                          # at start of phrase, that is 
208                          if item['list_label'].lower().startswith(aFragment.lower()): 
209                                  matches.append(item) 
210                  # no matches found 
211                  if len(matches) == 0: 
212                          return (False, []) 
213   
214                  matches.sort(self.__cmp_items) 
215                  return (True, matches) 
216          #-------------------------------------------------------- 
218                  """Return matches for aFragment at start of words inside phrases.""" 
219                  matches = [] 
220                  # look for matches 
221                  for item in self.__items: 
222                          item_label = item['list_label'].lower() 
223                          fragment_pos = item_label.find(aFragment.lower()) 
224                          # found at start of phrase 
225                          if fragment_pos == 0: 
226                                  matches.append(item) 
227                          # found as a true substring 
228                          elif fragment_pos > 0: 
229                                  # but use only if substring is at start of a word 
230                                  if item_label[fragment_pos-1] == u' ': 
231                                          matches.append(item) 
232                  # no matches found 
233                  if len(matches) == 0: 
234                          return (False, []) 
235   
236                  matches.sort(self.__cmp_items) 
237                  return (True, matches) 
238          #-------------------------------------------------------- 
240                  """Return matches for aFragment as a true substring.""" 
241                  matches = [] 
242                  # look for matches 
243                  for item in self.__items: 
244                          if item['list_label'].lower().find(aFragment.lower()) != -1: 
245                                  matches.append(item) 
246                  # no matches found 
247                  if len(matches) == 0: 
248                          return (False, []) 
249   
250                  matches.sort(self.__cmp_items) 
251                  return (True, matches) 
252          #-------------------------------------------------------- 
254                  """Return all items.""" 
255                  matches = self.__items 
256                  # no matches found 
257                  if len(matches) == 0: 
258                          return (False, []) 
259   
260                  matches.sort(self.__cmp_items) 
261                  return (True, matches) 
262          #-------------------------------------------------------- 
264                  """items must be a list of dicts. Each dict must have the keys (data, list_label, weight)""" 
265                  self.__items = items 
266          #-------------------------------------------------------- 
277  # =========================================================== 
279          """Match provider which searches matches 
280             in the results of a function call. 
281          """ 
283                  """get_candidates() must return a list of dicts, each having a 'list_label' key.""" 
284                  if get_candidates is None: 
285                          _log.error('must define function to retrieve match candidates list') 
286                          raise ValueError('must define function to retrieve match candidates list') 
287   
288                  self._get_candidates = get_candidates 
289                  cMatchProvider.__init__(self) 
290          #-------------------------------------------------------- 
291          # internal matching algorithms 
292          # 
293          # if we end up here: 
294          #       - aFragment will not be "None" 
295          #   - aFragment will be lower case 
296          #       - we _do_ deliver matches (whether we find any is a different story) 
297          #-------------------------------------------------------- 
299                  """Return matches for aFragment at start of phrases.""" 
300                  matches = [] 
301                  candidates = self._get_candidates() 
302                  # look for matches 
303                  for candidate in candidates: 
304                          # at start of phrase, that is 
305                          if aFragment.startswith(candidate['list_label'].lower()): 
306                                  matches.append(candidate) 
307                  # no matches found 
308                  if len(matches) == 0: 
309                          return (False, []) 
310   
311                  matches.sort(self.__cmp_candidates) 
312                  return (True, matches) 
313          #-------------------------------------------------------- 
315                  """Return matches for aFragment at start of words inside phrases.""" 
316                  matches = [] 
317                  candidates = self._get_candidates() 
318                  # look for matches 
319                  for candidate in candidates: 
320                          pos = candidate['list_label'].lower().find(aFragment) 
321  #                       pos = string.find(string.lower(candidate['list_label']), aFragment) 
322                          # found as a true substring 
323                          # but use only if substring is at start of a word 
324                          # FIXME: use word seps 
325                          if (pos == 0) or (candidate['list_label'][pos-1] == u' '): 
326                                  matches.append(candidate) 
327                  # no matches found 
328                  if len(matches) == 0: 
329                          return (False, []) 
330   
331                  matches.sort(self.__cmp_candidates) 
332                  return (True, matches) 
333          #-------------------------------------------------------- 
335                  """Return matches for aFragment as a true substring.""" 
336                  matches = [] 
337                  candidates = self._get_candidates() 
338                  # look for matches 
339                  for candidate in candidates: 
340                          if candidate['list_label'].lower().find(aFragment) != -1: 
341  #                       if string.find(string.lower(candidate['list_label']), aFragment) != -1: 
342                                  matches.append(candidate) 
343                  # no matches found 
344                  if len(matches) == 0: 
345                          return (False, []) 
346   
347                  matches.sort(self.__cmp_candidates) 
348                  return (True, matches) 
349          #-------------------------------------------------------- 
353          #-------------------------------------------------------- 
357                  # FIXME: do ordering 
358  #               if candidate1 < candidate2: 
359  #                       return -1 
360  #               if candidate1 == candidate2: 
361  #                       return 0 
362  #               return 1 
363   
364  # =========================================================== 
366          """Match provider which searches matches 
367             in possibly several database tables. 
368   
369          queries: 
370                  - a list of unicode strings 
371                  - each string is a query 
372                  - each string must contain: "... where <column> %(fragment_condition)s ..." 
373                  - each string can contain in the where clause: "... %(<context_key>)s ..." 
374                  - each query must return (data, label) 
375   
376          context definitions to be used in the queries 
377          example: {'ctxt_country': {'where_part': 'and country = %(country)s', 'placeholder': 'country'}} 
378   
379          _SQL_data2match: 
380                  SQL to retrieve a match by, say, primary key 
381                  wherein the only argument is 'pk' 
382          """ 
384   
385                  cMatchProvider.__init__(self) 
386   
387                  if type(queries) == type([]): 
388                          self._queries = queries 
389                  else: 
390                          self._queries = [queries] 
391   
392                  if context is None: 
393                          self._context = {} 
394                  else: 
395                          self._context = context 
396   
397                  self._args = {} 
398   
399                  self._SQL_data2match = None 
400          #-------------------------------------------------------- 
401          # internal matching algorithms 
402          # 
403          # if we end up here: 
404          #       - aFragment will not be "None" 
405          #   - aFragment will be lower case 
406          #       - we _do_ deliver matches (whether we find any is a different story) 
407          #-------------------------------------------------------- 
409                  """Return matches for aFragment at start of phrases.""" 
410   
411                  fragment_condition = u"ILIKE %(fragment)s" 
412                  self._args['fragment'] = u"%s%%" % aFragment 
413   
414                  return self._find_matches(fragment_condition) 
415          #-------------------------------------------------------- 
417                  """Return matches for aFragment at start of words inside phrases.""" 
418   
419                  fragment_condition = u"~* %(fragment)s" 
420                  aFragment = gmPG2.sanitize_pg_regex(expression = aFragment, escape_all = False) 
421                  self._args['fragment'] = u"( %s)|(^%s)" % (aFragment, aFragment) 
422   
423                  return self._find_matches(fragment_condition) 
424          #-------------------------------------------------------- 
426                  """Return matches for aFragment as a true substring.""" 
427   
428                  fragment_condition = u"ILIKE %(fragment)s" 
429                  self._args['fragment'] = u"%%%s%%" % aFragment 
430   
431                  return self._find_matches(fragment_condition) 
432          #-------------------------------------------------------- 
436          #-------------------------------------------------------- 
438                  if self._SQL_data2match is None: 
439                          return None 
440   
441                  query = {'cmd': self._SQL_data2match, 'args': {'pk': data}} 
442                  try: 
443                          rows, idx = gmPG2.run_ro_queries(queries = [query], get_col_idx = False) 
444                  except: 
445                          _log.exception('[%s]: error running _SQL_data2match, dropping query', self.__class__.__name__) 
446                          self._SQL_data2match = None 
447                          return None 
448   
449                  # hopefully the most frequent case: 
450                  if len(rows) == 1: 
451                          return rows[0] 
452   
453                  _log.error('[%s]: 0 or >1 rows found by running _SQL_data2match, ambiguous, ignoring', self.__class__.__name__) 
454                  return None 
455          #-------------------------------------------------------- 
457                  if self.print_queries: 
458                          print "----------------------" 
459                  matches = [] 
460                  for query in self._queries: 
461                          where_fragments = {'fragment_condition': fragment_condition} 
462   
463                          for context_key, context_def in self._context.items(): 
464                                  try: 
465                                          placeholder = context_def['placeholder'] 
466                                          where_part = context_def['where_part'] 
467                                          self._args[placeholder] = self._context_vals[placeholder] 
468                                          # we do have a context value for this key, so add the where condition 
469                                          where_fragments[context_key] = where_part 
470                                          if self.print_queries: 
471                                                  print "ctxt ph:", placeholder 
472                                                  print "ctxt where:", where_part 
473                                                  print "ctxt val:", self._context_vals[placeholder] 
474                                  except KeyError: 
475                                          # we don't have a context value for this key, so skip the where condition 
476                                          where_fragments[context_key] = u'' 
477                                          if self.print_queries: 
478                                                  print "invalid ctxt key:", context_key 
479   
480                          cmd = query % where_fragments 
481   
482                          if self.print_queries: 
483                                  print "class:", self.__class__.__name__ 
484                                  print "ctxt:", self._context_vals 
485                                  print "args:", self._args 
486                                  print "query:", cmd 
487   
488                          try: 
489                                  rows, idx = gmPG2.run_ro_queries(queries = [{'cmd': cmd, 'args': self._args}], get_col_idx = False) 
490                          except: 
491                                  _log.exception('[%s]: error running match provider SQL, dropping query', self.__class__.__name__) 
492                                  idx = self._queries.index(query) 
493                                  del self._queries[idx] 
494                                  break 
495   
496                          # no matches found: try next query 
497                          if len(rows) == 0: 
498                                  continue 
499   
500                          for row in rows: 
501                                  match = {'weight': 0} 
502   
503                                  try: 
504                                          match['data'] = row['data'] 
505                                  except KeyError: 
506                                          match['data'] = row[0] 
507   
508                                  try: 
509                                          match['list_label'] = row['list_label'] 
510                                  except KeyError: 
511                                          match['list_label'] = row[1] 
512   
513                                  # explicit "field_label" in result ? 
514                                  try: 
515                                          match['field_label'] = row['field_label'] 
516                                  # no 
517                                  except KeyError: 
518                                          # but does row[2] exist ? 
519                                          try: 
520                                                  match['field_label'] = row[2] 
521                                          # no: reuse "list_label" 
522                                          except IndexError: 
523                                                  match['field_label'] = match['list_label'] 
524   
525  #                               try: 
526  #                                       match['label'] = row['label'] 
527  #                               except KeyError: 
528  #                                       match['label'] = match['list_label'] 
529   
530                                  matches.append(match) 
531   
532                          return (True, matches) 
533   
534                  # none found whatsoever 
535                  return (False, []) 
536  #================================================================ 
537  if __name__ == '__main__': 
538          pass 
539   
540  #================================================================ 
541   
| Home | Trees | Indices | Help | 
 | 
|---|
| Generated by Epydoc 3.0.1 on Mon Dec 5 04:00:20 2011 | http://epydoc.sourceforge.net |