Module characterProcessing
[hide private]
[frames | no frames]

Source Code for Module characterProcessing

  1  #characterProcessing.py 
  2  #A part of NonVisual Desktop Access (NVDA) 
  3  #Copyright (C) 2010-2011 NV Access Inc, World Light Information Limited, Hong Kong Blind Union 
  4  #This file is covered by the GNU General Public License. 
  5  #See the file COPYING for more details. 
  6   
  7  import time 
  8  import os 
  9  import codecs 
 10  import collections 
 11  import re 
 12  from logHandler import log 
 13  import globalVars 
 14   
class LocaleDataMap(object):
    """Cache of locale-specific data objects which are created lazily, on first request."""

    def __init__(self, localeDataFactory):
        """
        @param localeDataFactory: callable which takes a locale name and returns the data object for it.
            It may raise LookupError (or return a false value) when no data exists for that locale.
        """
        self._localeDataFactory = localeDataFactory
        # Maps locale name -> data object already created for that locale.
        self._dataMap = {}

    def fetchLocaleData(self, locale):
        """
        Returns the data object for the given locale, creating and caching it if necessary.
        If the full locale can not be used to create a data object,
        the country is dropped (e.g. "en_US" becomes "en") and the lookup is retried.
        @param locale: the locale of the data object requested
        @type locale: string
        @return: the data object for the given locale
        @raise LookupError: if no data object could be obtained for the locale or its language.
        """
        candidates = [locale]
        if '_' in locale:
            # Fall back to the bare language code.
            candidates.append(locale.partition('_')[0])
        for candidate in candidates:
            cached = self._dataMap.get(candidate)
            if cached:
                return cached
            try:
                created = self._localeDataFactory(candidate)
            except LookupError:
                continue
            if created:
                self._dataMap[candidate] = created
                return created
        raise LookupError(locale)

    def invalidateLocaleData(self, locale):
        """Forget the cached data object (if any) for the given locale.
        A new data object will be created the next time this locale is requested.
        @param locale: The locale for which the data object should be invalidated.
        @type locale: str
        """
        self._dataMap.pop(locale, None)
59
class CharacterDescriptions(object):
    """
    Represents a map of characters to one or more descriptions (examples) for that character.
    The data is loaded from the characterDescriptions.dic file of the requested locale.
    """

    def __init__(self, locale):
        """
        @param locale: The characterDescriptions.dic file will be found by using this locale.
        @type locale: string
        @raise LookupError: If there is no characterDescriptions.dic file for this locale.
        """
        # Maps a character to the list of its descriptions.
        self._entries = {}
        fileName = os.path.join('locale', locale, 'characterDescriptions.dic')
        if not os.path.isfile(fileName):
            raise LookupError(fileName)
        # Use a context manager so the file is closed even if reading or parsing fails part way through.
        with codecs.open(fileName, "r", "utf_8_sig", errors="replace") as f:
            for line in f:
                if line.isspace() or line.startswith('#'):
                    # Blank line or comment.
                    continue
                line = line.rstrip('\r\n')
                # Each entry is the character followed by one or more tab separated descriptions.
                temp = line.split("\t")
                if len(temp) > 1:
                    key = temp.pop(0)
                    self._entries[key] = temp
                else:
                    log.warning("can't parse line '%s'" % line)
        log.debug("Loaded %d entries." % len(self._entries))

    def getCharacterDescription(self, character):
        """
        Looks up the given character and returns a list containing all the description strings found.
        @param character: The character to look up.
        @return: The list of descriptions, or C{None} if the character has no entry.
        """
        return self._entries.get(character)

# Module-wide cache of CharacterDescriptions objects, created per locale on first request.
_charDescLocaleDataMap = LocaleDataMap(CharacterDescriptions)

def getCharacterDescription(locale, character):
    """
    Finds a description or examples for the given character which make sense in the given locale.
    English is used as a fallback when the locale has no data at all,
    or has no description for this particular character.
    @param locale: the locale (language[_COUNTRY]) the description should be for.
    @type locale: string
    @param character: the character whose description should be retrieved.
    @type character: string
    @return: the found description for the given character
    @rtype: list of strings
    @raise LookupError: if the locale is English and no description data could be loaded for it.
    """
    try:
        localeDescriptions = _charDescLocaleDataMap.fetchLocaleData(locale)
    except LookupError:
        if locale.startswith('en'):
            # English is the last resort, so there is nothing left to fall back to.
            raise LookupError("en")
        return getCharacterDescription('en', character)
    found = localeDescriptions.getCharacterDescription(character)
    if found or locale.startswith('en'):
        return found
    # The locale has data, but nothing for this character.
    return getCharacterDescription('en', character)

# Speech symbol levels
# A symbol is spoken when the level in use is greater than or equal to the symbol's own level.
SYMLVL_NONE = 0
SYMLVL_SOME = 100
SYMLVL_MOST = 200
SYMLVL_ALL = 300
SYMLVL_CHAR = 1000
# Label for each level.
# NOTE(review): _() is presumably the gettext translation function installed as a builtin elsewhere - confirm.
SPEECH_SYMBOL_LEVEL_LABELS = {
    SYMLVL_NONE: _("none"),
    SYMLVL_SOME: _("some"),
    SYMLVL_MOST: _("most"),
    SYMLVL_ALL: _("all"),
    SYMLVL_CHAR: _("character"),
}
CONFIGURABLE_SPEECH_SYMBOL_LEVELS = (SYMLVL_NONE, SYMLVL_SOME, SYMLVL_MOST, SYMLVL_ALL)
# Every level: the configurable ones plus the character level.
SPEECH_SYMBOL_LEVELS = CONFIGURABLE_SPEECH_SYMBOL_LEVELS + (SYMLVL_CHAR,)

# Speech symbol preserve modes
# These control whether the original symbol text is kept in the processed text.
SYMPRES_NEVER = 0
SYMPRES_ALWAYS = 1
SYMPRES_NOREP = 2

class SpeechSymbol(object):
    """Plain record holding the pronunciation data for one symbol.
    Any field other than the identifier may be C{None}, meaning it has not been specified.
    """
    __slots__ = ("identifier", "pattern", "replacement", "level", "preserve", "displayName")

    def __init__(self, identifier, pattern=None, replacement=None, level=None, preserve=None, displayName=None):
        """Constructor.
        @param identifier: The identifier of the symbol.
        @param pattern: The regular expression pattern for a complex symbol.
        @param replacement: The text which replaces the symbol.
        @param level: The symbol level; one of the SYMLVL_* constants.
        @param preserve: The preserve mode; one of the SYMPRES_* constants.
        @param displayName: The name of the symbol to show to the user.
        """
        self.identifier = identifier
        self.pattern = pattern
        self.replacement = replacement
        self.level = level
        self.preserve = preserve
        self.displayName = displayName

    def __repr__(self):
        """Show every slot as name=value, in slot order."""
        described = [
            "{name}={val!r}".format(name=slot, val=getattr(self, slot))
            for slot in self.__slots__
        ]
        return "SpeechSymbol(%s)" % ", ".join(described)
156
class SpeechSymbols(object):
    """
    Contains raw information about the pronunciation of symbols.
    It does not handle inheritance of data from other sources, processing of text, etc.
    This is all handled by L{SpeechSymbolProcessor}.
    """

    def __init__(self):
        """Constructor.
        """
        #: Maps complex symbol identifiers to their regular expression patterns, in the order loaded.
        self.complexSymbols = collections.OrderedDict()
        #: Maps symbol identifiers to L{SpeechSymbol} instances, in the order loaded.
        self.symbols = collections.OrderedDict()
        #: The file name last passed to L{load} or L{save}.
        self.fileName = None

    def load(self, fileName, allowComplexSymbols=True):
        """Load symbol information from a file.
        Invalid lines are logged as warnings and skipped.
        @param fileName: The name of the file from which to load symbol information.
        @type fileName: str
        @param allowComplexSymbols: Whether to allow complex symbols.
            If C{False}, a complexSymbols section and every line in it is reported as invalid.
        @type allowComplexSymbols: bool
        @raise IOError: If the file cannot be read.
        """
        self.fileName = fileName
        with codecs.open(fileName, "r", "utf_8_sig", errors="replace") as f:
            # The method which handles lines of the section currently being read.
            handler = None
            for line in f:
                if line.isspace() or line.startswith("#"):
                    # Whitespace or comment.
                    continue
                line = line.rstrip("\r\n")
                try:
                    if line == "complexSymbols:":
                        if not allowComplexSymbols:
                            # Leave no section active so that the lines of this disallowed section
                            # are rejected instead of being parsed by a previously active handler
                            # (e.g. misread as simple symbols).
                            handler = None
                            raise ValueError
                        handler = self._loadComplexSymbol
                    elif line == "symbols:":
                        handler = self._loadSymbol
                    elif handler:
                        # This is a line within a section, so handle it according to which section we're in.
                        handler(line)
                    else:
                        raise ValueError
                except ValueError:
                    log.warning(u"Invalid line in file {file}: {line}".format(
                        file=fileName, line=line))

    def _loadComplexSymbol(self, line):
        """Parse one line of the complexSymbols section: identifier, tab, pattern.
        @raise ValueError: If the line does not consist of exactly two tab separated fields.
        """
        fields = line.split("\t")
        if len(fields) != 2:
            raise ValueError
        identifier, pattern = fields
        self.complexSymbols[identifier] = pattern

    def _loadSymbolField(self, input, inputMap=None):
        """Convert one field read from a file to its in-memory value.
        @param input: The text of the field.
        @param inputMap: Optional map from allowed field text to values.
        @return: C{None} for the default marker "-", otherwise the (mapped) value.
        @raise ValueError: If C{inputMap} is given and does not contain the field text.
        """
        if input == "-":
            # Default.
            return None
        if not inputMap:
            return input
        try:
            return inputMap[input]
        except KeyError:
            raise ValueError

    #: Maps the character following a leading backslash in an identifier to the character it stands for.
    IDENTIFIER_ESCAPES_INPUT = {
        "0": "\0",
        "t": "\t",
        "n": "\n",
        "r": "\r",
        "f": "\f",
        "v": "\v",
        "#": "#",
        "\\": "\\",
    }
    IDENTIFIER_ESCAPES_OUTPUT = {v: k for k, v in IDENTIFIER_ESCAPES_INPUT.items()}
    #: Maps level names used in files to the SYMLVL_* constants.
    LEVEL_INPUT = {
        "none": SYMLVL_NONE,
        "some": SYMLVL_SOME,
        "most": SYMLVL_MOST,
        "all": SYMLVL_ALL,
        "char": SYMLVL_CHAR,
    }
    LEVEL_OUTPUT = {v: k for k, v in LEVEL_INPUT.items()}
    #: Maps preserve mode names used in files to the SYMPRES_* constants.
    PRESERVE_INPUT = {
        "never": SYMPRES_NEVER,
        "always": SYMPRES_ALWAYS,
        "norep": SYMPRES_NOREP,
    }
    PRESERVE_OUTPUT = {v: k for k, v in PRESERVE_INPUT.items()}

    def _loadSymbol(self, line):
        """Parse one line of the symbols section and store the resulting L{SpeechSymbol}.
        The tab separated fields are: identifier, replacement, optional level, optional preserve mode
        and an optional trailing display name introduced by "#".
        @raise ValueError: If a mandatory field is missing or a field has an invalid value.
        """
        fields = line.split("\t")
        identifier = replacement = level = preserve = displayName = None
        if fields[-1].startswith("#"):
            # Regardless of how many fields there are,
            # if the last field is a comment, it is the display name.
            displayName = fields[-1][1:].lstrip()
            del fields[-1]
        fields = iter(fields)
        try:
            identifier = next(fields)
            if not identifier:
                # Empty identifier is not allowed.
                raise ValueError
            if identifier.startswith("\\") and len(identifier) >= 2:
                # Only the first character can be escaped; unknown escapes stand for the character itself.
                identifier = self.IDENTIFIER_ESCAPES_INPUT.get(identifier[1], identifier[1]) + identifier[2:]
            replacement = self._loadSymbolField(next(fields))
        except StopIteration:
            # These fields are mandatory.
            raise ValueError
        try:
            level = self._loadSymbolField(next(fields), self.LEVEL_INPUT)
            preserve = self._loadSymbolField(next(fields), self.PRESERVE_INPUT)
        except StopIteration:
            # These fields are optional. Defaults will be used for unspecified fields.
            pass
        self.symbols[identifier] = SpeechSymbol(identifier, None, replacement, level, preserve, displayName)

    def save(self, fileName=None):
        """Save symbol information to a file.
        @param fileName: The name of the file to which to save symbol information,
            C{None} to use the file name last passed to L{load} or L{save}.
        @type fileName: str
        @raise IOError: If the file cannot be written.
        @raise ValueError: If C{fileName} is C{None}
            and L{load} or L{save} has not been called.
        """
        if fileName:
            self.fileName = fileName
        elif self.fileName:
            fileName = self.fileName
        else:
            raise ValueError("No file name")

        with codecs.open(fileName, "w", "utf_8_sig", errors="replace") as f:
            if self.complexSymbols:
                f.write(u"complexSymbols:\r\n")
                for identifier, pattern in self.complexSymbols.items():
                    f.write(u"%s\t%s\r\n" % (identifier, pattern))
                f.write(u"\r\n")

            if self.symbols:
                f.write(u"symbols:\r\n")
                for symbol in self.symbols.values():
                    f.write(u"%s\r\n" % self._saveSymbol(symbol))

    def _saveSymbolField(self, output, outputMap=None):
        """Convert one in-memory field value to the text written to a file.
        @param output: The value of the field.
        @param outputMap: Optional map from values to field text.
        @return: "-" for C{None}, otherwise the (mapped) value.
        @raise ValueError: If C{outputMap} is given and does not contain the value.
        """
        if output is None:
            return "-"
        if not outputMap:
            return output
        try:
            return outputMap[output]
        except KeyError:
            raise ValueError

    def _saveSymbol(self, symbol):
        """Build the line written to a file for a symbol.
        @param symbol: The symbol to save.
        @type symbol: L{SpeechSymbol}
        @return: The tab separated line, without a line ending.
        """
        identifier = symbol.identifier
        try:
            # Escape the first character if it has an escape sequence.
            identifier = u"\\%s%s" % (
                self.IDENTIFIER_ESCAPES_OUTPUT[identifier[0]], identifier[1:])
        except KeyError:
            pass
        fields = [identifier,
            self._saveSymbolField(symbol.replacement),
            self._saveSymbolField(symbol.level, self.LEVEL_OUTPUT),
            self._saveSymbolField(symbol.preserve, self.PRESERVE_OUTPUT)
        ]
        # Strip trailing optional fields with default values.
        # Fields are positional, so stripping must stop at the first optional field (from the end)
        # which specifies a value; a default field before it has to stay as "-".
        while len(fields) > 2 and fields[-1] == "-":
            del fields[-1]
        if symbol.displayName:
            fields.append("# %s" % symbol.displayName)
        return u"\t".join(fields)
330
def _getSpeechSymbolsForLocale(locale):
    """Load the built-in and the user's symbol information for a locale.
    @param locale: The locale for which symbol information should be loaded.
    @type locale: str
    @return: The built-in symbol information and the user's symbol information.
        The user's information is empty if the user's file cannot be read.
    @rtype: tuple of (L{SpeechSymbols}, L{SpeechSymbols})
    @raise LookupError: If the built-in symbol file for the locale cannot be read.
    """
    builtinPath = os.path.join("locale", locale, "symbols.dic")
    builtinSymbols = SpeechSymbols()
    try:
        builtinSymbols.load(builtinPath)
    except IOError:
        raise LookupError("No symbol information for locale %s" % locale)

    userPath = os.path.join(globalVars.appArgs.configPath, "symbols-%s.dic" % locale)
    userSymbols = SpeechSymbols()
    try:
        # Don't allow users to specify complex symbols
        # because an error will cause the whole processor to fail.
        userSymbols.load(userPath, allowComplexSymbols=False)
    except IOError:
        # An empty user SpeechSymbols is okay.
        pass
    return builtinSymbols, userSymbols
347
class SpeechSymbolProcessor(object):
    """
    Handles processing of symbol pronunciation for a locale.
    Pronunciation information is taken from one or more L{SpeechSymbols} instances.
    """

    #: Caches symbol data for locales.
    localeSymbols = LocaleDataMap(_getSpeechSymbolsForLocale)

    def __init__(self, locale):
        """Constructor.
        @param locale: The locale for which symbol pronunciation should be processed.
        @type locale: str
        @raise LookupError: If symbol data cannot be loaded for the locale (or for English),
            or if the combined regular expression is invalid.
        """
        self.locale = locale

        # We need to merge symbol data from several sources.
        # Earlier sources take priority over later ones.
        sources = self.sources = []
        builtin, user = self.localeSymbols.fetchLocaleData(locale)
        self.userSymbols = user
        sources.append(user)
        sources.append(builtin)

        # Always use English as a base.
        if locale != "en":
            # Only the builtin data.
            sources.append(self.localeSymbols.fetchLocaleData("en")[0])

        # The computed symbol information from all sources.
        symbols = self.computedSymbols = collections.OrderedDict()
        # An indexable list of complex symbols for use in building/executing the regexp.
        complexSymbolsList = self._computedComplexSymbolsList = []
        # A list of simple symbol identifiers for use in building the regexp.
        simpleSymbolIdentifiers = []

        # Add all complex symbols first, as they take priority.
        for source in sources:
            for identifier, pattern in source.complexSymbols.items():
                if identifier in symbols:
                    # Already defined.
                    continue
                symbol = SpeechSymbol(identifier, pattern)
                symbols[identifier] = symbol
                complexSymbolsList.append(symbol)

        # Supplement the data for complex symbols and add all simple symbols.
        for source in sources:
            for identifier, sourceSymbol in source.symbols.items():
                try:
                    symbol = symbols[identifier]
                    # We're updating an already existing symbol.
                except KeyError:
                    # This is a new simple symbol.
                    # (All complex symbols have already been added.)
                    symbol = symbols[identifier] = SpeechSymbol(identifier)
                    simpleSymbolIdentifiers.append(identifier)
                # If fields weren't explicitly specified, inherit the value from later sources.
                if symbol.replacement is None:
                    symbol.replacement = sourceSymbol.replacement
                if symbol.level is None:
                    symbol.level = sourceSymbol.level
                if symbol.preserve is None:
                    symbol.preserve = sourceSymbol.preserve
                if symbol.displayName is None:
                    symbol.displayName = sourceSymbol.displayName

        # Set defaults for any fields not explicitly set.
        # Iterate over a copy because useless symbols are deleted along the way.
        for symbol in list(symbols.values()):
            if symbol.replacement is None:
                # Symbols without a replacement specified are useless.
                log.warning(u"Replacement not defined in locale {locale} for symbol: {symbol}".format(
                    symbol=symbol.identifier, locale=self.locale))
                del symbols[symbol.identifier]
                try:
                    complexSymbolsList.remove(symbol)
                except ValueError:
                    pass
                continue
            if symbol.level is None:
                symbol.level = SYMLVL_ALL
            if symbol.preserve is None:
                symbol.preserve = SYMPRES_NEVER
            if symbol.displayName is None:
                symbol.displayName = symbol.identifier

        # Only symbols which survived the pruning above may take part in the regexp.
        # Otherwise, the regexp would match text for which there is no computed symbol,
        # which would fail with a KeyError when the match is processed.
        simpleSymbolIdentifiers = [identifier for identifier in simpleSymbolIdentifiers
            if identifier in symbols]
        # Single character symbols.
        characters = "".join(identifier for identifier in simpleSymbolIdentifiers
            if len(identifier) == 1)
        # The simple symbols must be ordered longest first so that the longer symbols will match.
        simpleSymbolIdentifiers.sort(key=len, reverse=True)

        # Build the regexp.
        patterns = [
            # Strip repeated spaces from the end of the line to stop them from being picked up by repeated.
            r"(?P<rstripSpace> +$)",
        ]
        if characters:
            # Repeated characters: more than 3 repeats.
            # Skipped when there are no characters, as an empty character class is not a valid regexp.
            patterns.append(
                r"(?P<repeated>(?P<repTmp>[%s])(?P=repTmp){3,})" % re.escape(characters))
        # Complex symbols.
        # Each complex symbol has its own named group so we know which symbol matched.
        patterns.extend(
            u"(?P<c{index}>{pattern})".format(index=index, pattern=symbol.pattern)
            for index, symbol in enumerate(complexSymbolsList))
        if simpleSymbolIdentifiers:
            # Simple symbols.
            # These are all handled in one named group.
            # Because the symbols are just text, we know which symbol matched just by looking at the matched text.
            # Skipped when there are no simple symbols, as an empty group would match the empty string everywhere.
            patterns.append(u"(?P<simple>{})".format(
                "|".join(re.escape(identifier) for identifier in simpleSymbolIdentifiers)
            ))
        pattern = "|".join(patterns)
        try:
            self._regexp = re.compile(pattern, re.UNICODE)
        except re.error as e:
            log.error("Invalid complex symbol regular expression in locale %s: %s" % (locale, e))
            raise LookupError

    def _regexpRepl(self, m):
        """Substitution callback for the regexp: compute the replacement text for one match.
        The name of the matching group identifies what kind of match this is.
        @param m: The match object.
        @return: The text which should replace the matched text.
        """
        group = m.lastgroup

        if group == "rstripSpace":
            return ""

        elif group == "repeated":
            # Repeated character.
            text = m.group()
            symbol = self.computedSymbols[text[0]]
            if self._level >= symbol.level:
                return u" {count} {char} ".format(count=len(text), char=symbol.replacement)
            else:
                return " "

        else:
            # One of the defined symbols.
            text = m.group()
            if group == "simple":
                # Simple symbol.
                symbol = self.computedSymbols[text]
            else:
                # Complex symbol.
                # The group name is "c" followed by the index into the complex symbols list.
                index = int(group[1:])
                symbol = self._computedComplexSymbolsList[index]
            if symbol.preserve == SYMPRES_ALWAYS or (symbol.preserve == SYMPRES_NOREP and self._level < symbol.level):
                # Keep the original text after any replacement.
                suffix = text
            else:
                suffix = " "
            if self._level >= symbol.level and symbol.replacement:
                return u" {repl}{suffix}".format(repl=symbol.replacement, suffix=suffix)
            else:
                return suffix

    def processText(self, text, level):
        """Process some text, converting symbols according to the computed symbol information.
        @param text: The text to process.
        @param level: The symbol level to use; one of the SYMLVL_* constants.
        @return: The processed text.
        """
        # Stored on the instance so that the substitution callback can read it.
        self._level = level
        return self._regexp.sub(self._regexpRepl, text)

    def updateSymbol(self, newSymbol):
        """Update information for a symbol if it has changed.
        If there is a change, the changed information will be added to the user's symbol data.
        These changes do not take effect until the symbol processor is reinitialised.
        @param newSymbol: The symbol to update.
        @type newSymbol: L{SpeechSymbol}
        @return: Whether there was a change.
        @rtype: bool
        @raise KeyError: If there is no computed symbol with the identifier of C{newSymbol}.
        """
        identifier = newSymbol.identifier
        oldSymbol = self.computedSymbols[identifier]
        if oldSymbol is newSymbol:
            return False
        try:
            userSymbol = self.userSymbols.symbols[identifier]
        except KeyError:
            userSymbol = SpeechSymbol(identifier)

        # Only fields which differ from the computed symbol are stored in the user's data.
        changed = False
        for attr in ("pattern", "replacement", "level", "preserve", "displayName"):
            newValue = getattr(newSymbol, attr)
            if newValue != getattr(oldSymbol, attr):
                setattr(userSymbol, attr, newValue)
                changed = True

        if not changed:
            return False

        # Do this in case the symbol wasn't in userSymbols before.
        self.userSymbols.symbols[identifier] = userSymbol
        return True

# Module-wide cache of SpeechSymbolProcessor objects, created per locale on first request.
_localeSpeechSymbolProcessors = LocaleDataMap(SpeechSymbolProcessor)

def processSpeechSymbols(locale, text, level):
    """Process some text, converting symbols according to desired pronunciation.
    If no symbol processor is available for the locale, English is used instead.
    @param locale: The locale of the text.
    @type locale: str
    @param text: The text to process.
    @type text: str
    @param level: The symbol level to use; one of the SYMLVL_* constants.
    @return: The processed text.
    @raise LookupError: If no symbol processor is available for the locale or for English.
    """
    try:
        ss = _localeSpeechSymbolProcessors.fetchLocaleData(locale)
    except LookupError:
        # For "en" and "en_*" locales, plain English has already been tried by the lookup above,
        # so there is nothing left to fall back to.
        # In particular, retrying "en" for locale "en" would recurse forever.
        if locale == "en" or locale.startswith("en_"):
            raise
        return processSpeechSymbols("en", text, level)
    return ss.processText(text, level)
563
def processSpeechSymbol(locale, symbol):
    """Process a single symbol according to desired pronunciation.
    If no symbol processor is available for the locale, English is used instead.
    @param locale: The locale of the symbol.
    @type locale: str
    @param symbol: The symbol.
    @type symbol: str
    @return: The replacement for the symbol, or the symbol itself if it is not a known symbol.
    @raise LookupError: If no symbol processor is available for the locale or for English.
    """
    try:
        ss = _localeSpeechSymbolProcessors.fetchLocaleData(locale)
    except LookupError:
        # For "en" and "en_*" locales, plain English has already been tried by the lookup above,
        # so there is nothing left to fall back to.
        # In particular, retrying "en" for locale "en" would recurse forever.
        if locale == "en" or locale.startswith("en_"):
            raise
        return processSpeechSymbol("en", symbol)
    try:
        return ss.computedSymbols[symbol].replacement
    except KeyError:
        # Not a known symbol, so it is returned unchanged.
        return symbol
582