|
Module characterProcessing
|
|
1
2
3
4
5
6
7 import time
8 import os
9 import codecs
10 import collections
11 import re
12 from logHandler import log
13 import globalVars
14
class LocaleDataMap(object):
    """Allows access to locale-specific data objects, dynamically loading them if needed on request."""

    def __init__(self, localeDataFactory):
        """
        @param localeDataFactory: the factory to create data objects for the requested locale.
            It is called with a single locale string and may raise C{LookupError}
            when it has no data for that locale.
        """
        self._localeDataFactory = localeDataFactory
        # Cache of data objects already created, keyed by the locale string that produced them.
        self._dataMap = {}

    def fetchLocaleData(self, locale):
        """
        Fetches a data object for the given locale.
        This may mean that the data object is first created and stored if it does not yet exist in the map.
        The locale is also simplified (country is dropped) if the full locale can not be used to create a data object.
        @param locale: the locale of the data object requested
        @type locale: string
        @return: the data object for the given locale
        @raise LookupError: if neither the full nor the simplified locale yields a data object.
        """
        # Try the full locale first, then the bare language (text before the first underscore).
        candidates = [locale]
        if '_' in locale:
            candidates.append(locale.split('_')[0])
        for candidate in candidates:
            data = self._dataMap.get(candidate)
            if data:
                return data
            try:
                data = self._localeDataFactory(candidate)
            except LookupError:
                data = None
            if not data:
                # Nothing usable for this candidate; fall through to the next one.
                continue
            self._dataMap[candidate] = data
            return data
        raise LookupError(locale)

    def invalidateLocaleData(self, locale):
        """Invalidate the data object (if any) for the given locale.
        This will cause a new data object to be created when this locale is next requested.
        @param locale: The locale for which the data object should be invalidated.
        @type locale: str
        """
        try:
            del self._dataMap[locale]
        except KeyError:
            # Nothing was cached for this locale, so there is nothing to invalidate.
            pass
59
class CharacterDescriptions(object):
    """
    Represents a map of characters to one or more descriptions (examples) for that character.
    The data is loaded from a file from the requested locale.
    """

    def __init__(self, locale):
        """
        @param locale: The characterDescriptions.dic file will be found by using this locale.
        @type locale: string
        @raise LookupError: If there is no characterDescriptions.dic file for this locale.
        """
        self._entries = {}
        fileName = os.path.join('locale', locale, 'characterDescriptions.dic')
        if not os.path.isfile(fileName):
            raise LookupError(fileName)
        # The with block guarantees the file is closed even if parsing raises part way through.
        with codecs.open(fileName, "r", "utf_8_sig", errors="replace") as f:
            for line in f:
                if line.isspace() or line.startswith('#'):
                    # Blank line or comment.
                    continue
                line = line.rstrip('\r\n')
                # Tab-separated: the character first, then one or more descriptions.
                temp = line.split("\t")
                if len(temp) > 1:
                    key = temp.pop(0)
                    self._entries[key] = temp
                else:
                    log.warning("can't parse line '%s'" % line)
        log.debug("Loaded %d entries." % len(self._entries))

    def getCharacterDescription(self, character):
        """
        Looks up the given character and returns a list containing all the description strings found.
        @return: the list of descriptions, or C{None} if the character has no entry.
        """
        return self._entries.get(character)
94
# Per-locale cache of CharacterDescriptions objects, created on first request.
_charDescLocaleDataMap = LocaleDataMap(CharacterDescriptions)


def getCharacterDescription(locale, character):
    """
    Finds a description or examples for the given character, which makes sense in the given locale.
    Falls back to English when the locale has no data at all or no entry for the character.
    @param locale: the locale (language[_COUNTRY]) the description should be for.
    @type locale: string
    @param character: the character whose description should be retrieved.
    @type character: string
    @return: the found description for the given character, or C{None} if there is none
    @rtype: list of strings
    @raise LookupError: if no description data can be loaded, not even for English.
    """
    try:
        descriptions = _charDescLocaleDataMap.fetchLocaleData(locale)
    except LookupError:
        if not locale.startswith('en'):
            return getCharacterDescription('en', character)
        # English itself is unavailable, so there is nothing left to fall back to.
        raise LookupError("en")
    desc = descriptions.getCharacterDescription(character)
    if not desc and not locale.startswith('en'):
        desc = getCharacterDescription('en', character)
    return desc
117
118
# Speech symbol levels.
# A symbol's replacement is spoken when the requested level is >= the symbol's own level.
SYMLVL_NONE = 0
SYMLVL_SOME = 100
SYMLVL_MOST = 200
SYMLVL_ALL = 300
SYMLVL_CHAR = 1000
# User-visible (translated) label for each level.
SPEECH_SYMBOL_LEVEL_LABELS = {
    SYMLVL_NONE: _("none"),
    SYMLVL_SOME: _("some"),
    SYMLVL_MOST: _("most"),
    SYMLVL_ALL: _("all"),
    SYMLVL_CHAR: _("character"),
}
# SYMLVL_CHAR is a valid level but is excluded from the configurable ones.
CONFIGURABLE_SPEECH_SYMBOL_LEVELS = (SYMLVL_NONE, SYMLVL_SOME, SYMLVL_MOST, SYMLVL_ALL)
SPEECH_SYMBOL_LEVELS = CONFIGURABLE_SPEECH_SYMBOL_LEVELS + (SYMLVL_CHAR,)

# Speech symbol preserve modes.
# NEVER: the symbol text is replaced by a space.
# ALWAYS: the symbol text is always kept in the output.
# NOREP: the symbol text is kept only when its replacement is not spoken at the current level.
SYMPRES_NEVER = 0
SYMPRES_ALWAYS = 1
SYMPRES_NOREP = 2
138
class SpeechSymbol(object):
    """Pronunciation information for a single symbol.
    Any field other than the identifier may be C{None}, meaning it has not been specified.
    """
    __slots__ = ("identifier", "pattern", "replacement", "level", "preserve", "displayName")

    def __init__(self, identifier, pattern=None, replacement=None, level=None, preserve=None, displayName=None):
        """
        @param identifier: The identifier of the symbol.
        @param pattern: The regular expression pattern, for complex symbols.
        @param replacement: The text spoken in place of the symbol.
        @param level: The symbol level; one of the SYMLVL_* constants.
        @param preserve: The preserve mode; one of the SYMPRES_* constants.
        @param displayName: The name shown to the user for this symbol.
        """
        self.identifier = identifier
        self.pattern = pattern
        self.replacement = replacement
        self.level = level
        self.preserve = preserve
        self.displayName = displayName

    def __repr__(self):
        """Return a debugging representation listing every slot and its value."""
        attrs = ", ".join(
            "{name}={val!r}".format(name=attr, val=getattr(self, attr))
            for attr in self.__slots__)
        return "SpeechSymbol(%s)" % attrs
156
class SpeechSymbols(object):
    """
    Contains raw information about the pronunciation of symbols.
    It does not handle inheritance of data from other sources, processing of text, etc.
    This is all handled by L{SpeechSymbolProcessor}.
    """

    def __init__(self):
        """Constructor.
        """
        # Complex symbols: identifier -> regular expression pattern, in file order.
        self.complexSymbols = collections.OrderedDict()
        # Simple symbols: identifier -> L{SpeechSymbol}, in file order.
        self.symbols = collections.OrderedDict()
        self.fileName = None

    def load(self, fileName, allowComplexSymbols=True):
        """Load symbol information from a file.
        Invalid lines are logged as warnings and skipped.
        @param fileName: The name of the file from which to load symbol information.
        @type fileName: str
        @param allowComplexSymbols: Whether to allow complex symbols.
        @type allowComplexSymbols: bool
        @raise IOError: If the file cannot be read.
        """
        # Remembered before opening, so that L{save} has a target even if reading fails.
        self.fileName = fileName
        with codecs.open(fileName, "r", "utf_8_sig", errors="replace") as f:
            handler = None
            for line in f:
                if line.isspace() or line.startswith("#"):
                    # Whitespace or comment.
                    continue
                line = line.rstrip("\r\n")
                try:
                    if line == "complexSymbols:" and allowComplexSymbols:
                        handler = self._loadComplexSymbol
                    elif line == "symbols:":
                        handler = self._loadSymbol
                    elif handler:
                        # A line within a section is parsed by that section's handler.
                        handler(line)
                    else:
                        # Content before any section header.
                        raise ValueError
                except ValueError:
                    log.warning(u"Invalid line in file {file}: {line}".format(
                        file=fileName, line=line))

    def _loadComplexSymbol(self, line):
        """Parse one line of the complexSymbols section: identifier, tab, pattern.
        @raise ValueError: If the line does not contain exactly two tab-separated fields.
        """
        # A wrong field count makes the unpacking itself raise ValueError,
        # which is exactly what L{load} expects for an invalid line.
        identifier, pattern = line.split("\t")
        self.complexSymbols[identifier] = pattern

    def _loadSymbolField(self, input, inputMap=None):
        """Convert one field as read from the file into its in-memory value.
        @param input: The field text.
        @param inputMap: Optional mapping from field text to value.
        @return: C{None} for "-" (not specified), otherwise the mapped or raw value.
        @raise ValueError: If a map is given and the text is not one of its keys.
        """
        if input == "-":
            # Default.
            return None
        if not inputMap:
            return input
        try:
            return inputMap[input]
        except KeyError:
            raise ValueError

    # Escapes recognised after a leading backslash in an identifier.
    IDENTIFIER_ESCAPES_INPUT = {
        "0": "\0",
        "t": "\t",
        "n": "\n",
        "r": "\r",
        "f": "\f",
        "v": "\v",
        "#": "#",
        "\\": "\\",
    }
    IDENTIFIER_ESCAPES_OUTPUT = {v: k for k, v in IDENTIFIER_ESCAPES_INPUT.items()}
    LEVEL_INPUT = {
        "none": SYMLVL_NONE,
        "some": SYMLVL_SOME,
        "most": SYMLVL_MOST,
        "all": SYMLVL_ALL,
        "char": SYMLVL_CHAR,
    }
    LEVEL_OUTPUT = {v: k for k, v in LEVEL_INPUT.items()}
    PRESERVE_INPUT = {
        "never": SYMPRES_NEVER,
        "always": SYMPRES_ALWAYS,
        "norep": SYMPRES_NOREP,
    }
    PRESERVE_OUTPUT = {v: k for k, v in PRESERVE_INPUT.items()}

    def _loadSymbol(self, line):
        """Parse one line of the symbols section and store the resulting L{SpeechSymbol}.
        Fields are tab-separated: identifier, replacement, optional level, optional preserve,
        and an optional final field starting with "#" which holds the display name.
        @raise ValueError: If the identifier or replacement is missing, or a field is invalid.
        """
        line = line.split("\t")
        identifier = replacement = level = preserve = displayName = None
        if line[-1].startswith("#"):
            # A trailing "#" field is the display name, whatever fields precede it.
            displayName = line[-1][1:].lstrip()
            del line[-1]
        line = iter(line)
        try:
            identifier = next(line)
            if not identifier:
                # Empty identifier.
                raise ValueError
            if identifier.startswith("\\") and len(identifier) >= 2:
                # Unknown escapes resolve to the escaped character itself.
                identifier = self.IDENTIFIER_ESCAPES_INPUT.get(identifier[1], identifier[1]) + identifier[2:]
            replacement = self._loadSymbolField(next(line))
        except StopIteration:
            # These fields are mandatory.
            raise ValueError
        try:
            level = self._loadSymbolField(next(line), self.LEVEL_INPUT)
            preserve = self._loadSymbolField(next(line), self.PRESERVE_INPUT)
        except StopIteration:
            # These fields are optional; whatever is missing stays None.
            pass
        self.symbols[identifier] = SpeechSymbol(identifier, None, replacement, level, preserve, displayName)

    def save(self, fileName=None):
        """Save symbol information to a file.
        @param fileName: The name of the file to which to save symbol information,
            C{None} to use the file name last passed to L{load} or L{save}.
        @type fileName: str
        @raise IOError: If the file cannot be written.
        @raise ValueError: If C{fileName} is C{None}
            and L{load} or L{save} has not been called.
        """
        if fileName:
            self.fileName = fileName
        elif self.fileName:
            fileName = self.fileName
        else:
            raise ValueError("No file name")

        with codecs.open(fileName, "w", "utf_8_sig", errors="replace") as f:
            if self.complexSymbols:
                f.write(u"complexSymbols:\r\n")
                for identifier, pattern in self.complexSymbols.items():
                    f.write(u"%s\t%s\r\n" % (identifier, pattern))
                f.write(u"\r\n")

            if self.symbols:
                f.write(u"symbols:\r\n")
                for symbol in self.symbols.values():
                    f.write(u"%s\r\n" % self._saveSymbol(symbol))

    def _saveSymbolField(self, output, outputMap=None):
        """Convert one in-memory value into the text written to the file.
        @param output: The value.
        @param outputMap: Optional mapping from value to field text.
        @return: "-" for C{None}, otherwise the mapped or raw value.
        @raise ValueError: If a map is given and the value is not one of its keys.
        """
        if output is None:
            return "-"
        if not outputMap:
            return output
        try:
            return outputMap[output]
        except KeyError:
            raise ValueError

    def _saveSymbol(self, symbol):
        """Build the file line for a symbol; the inverse of L{_loadSymbol}.
        @param symbol: The symbol to serialise.
        @type symbol: L{SpeechSymbol}
        @return: The tab-separated line, without a line ending.
        """
        identifier = symbol.identifier
        try:
            # Escape a leading character which would otherwise be misread when loading.
            identifier = u"\\%s%s" % (
                self.IDENTIFIER_ESCAPES_OUTPUT[identifier[0]], identifier[1:])
        except KeyError:
            pass
        fields = [identifier,
            self._saveSymbolField(symbol.replacement),
            self._saveSymbolField(symbol.level, self.LEVEL_OUTPUT),
            self._saveSymbolField(symbol.preserve, self.PRESERVE_OUTPUT)
        ]
        # Strip optional fields left at their default, working backwards from the end.
        for field in reversed(fields[2:]):
            if field != "-":
                # This field carries a value, so it and everything before it must stay;
                # removing more would drop an explicit preserve mode when level is unset.
                break
            del fields[-1]
        if symbol.displayName:
            fields.append("# %s" % symbol.displayName)
        return u"\t".join(fields)
330
def _getSpeechSymbolsForLocale(locale):
    """Load the builtin and user symbol data for a locale.
    @param locale: The locale whose symbol files should be loaded.
    @type locale: str
    @return: The builtin and user symbol data, in that order.
    @rtype: tuple of (L{SpeechSymbols}, L{SpeechSymbols})
    @raise LookupError: If the builtin symbols.dic for the locale cannot be read.
    """
    builtin = SpeechSymbols()
    try:
        builtin.load(os.path.join("locale", locale, "symbols.dic"))
    except IOError:
        raise LookupError("No symbol information for locale %s" % locale)
    user = SpeechSymbols()
    try:
        # Complex symbols are not accepted from the user's file.
        user.load(os.path.join(globalVars.appArgs.configPath, "symbols-%s.dic" % locale),
            allowComplexSymbols=False)
    except IOError:
        # A missing user file is fine: the user data simply stays empty.
        # load() has already recorded the file name, so a later save() knows where to write.
        pass
    return builtin, user
347
class SpeechSymbolProcessor(object):
    """
    Handles processing of symbol pronunciation for a locale.
    Pronunciation information is taken from one or more L{SpeechSymbols} instances.
    """

    #: Caches symbol data for locales.
    localeSymbols = LocaleDataMap(_getSpeechSymbolsForLocale)

    def __init__(self, locale):
        """Constructor.
        @param locale: The locale for which symbol pronunciation should be processed.
        @type locale: str
        @raise LookupError: If symbol data cannot be fetched for the locale or for English,
            or if the combined regular expression cannot be compiled.
        """
        self.locale = locale

        # Symbol data is merged from several sources, highest priority first.
        sources = self.sources = []
        builtin, user = self.localeSymbols.fetchLocaleData(locale)
        self.userSymbols = user
        sources.append(user)
        sources.append(builtin)

        # Always use English as a base, but only its builtin data.
        if locale != "en":
            sources.append(self.localeSymbols.fetchLocaleData("en")[0])

        # The computed symbol information from all sources.
        symbols = self.computedSymbols = collections.OrderedDict()
        # Complex symbols in matching order; a symbol's index here names its regexp group.
        complexSymbolsList = self._computedComplexSymbolsList = []
        # Identifiers of simple (non-pattern) symbols.
        simpleSymbolIdentifiers = []

        # Complex symbols first: the first source defining an identifier supplies its pattern.
        for source in sources:
            for identifier, pattern in source.complexSymbols.items():
                if identifier in symbols:
                    # Already defined by a higher-priority source.
                    continue
                symbol = SpeechSymbol(identifier, pattern)
                symbols[identifier] = symbol
                complexSymbolsList.append(symbol)

        # Then the per-symbol fields; each field comes from the first source that sets it.
        for source in sources:
            for identifier, sourceSymbol in source.symbols.items():
                try:
                    # Either a complex symbol or a simple symbol seen in an earlier source.
                    symbol = symbols[identifier]
                except KeyError:
                    # A new simple symbol.
                    symbol = symbols[identifier] = SpeechSymbol(identifier)
                    simpleSymbolIdentifiers.append(identifier)
                if symbol.replacement is None:
                    symbol.replacement = sourceSymbol.replacement
                if symbol.level is None:
                    symbol.level = sourceSymbol.level
                if symbol.preserve is None:
                    symbol.preserve = sourceSymbol.preserve
                if symbol.displayName is None:
                    symbol.displayName = sourceSymbol.displayName

        # Drop unusable symbols and fill in defaults for the rest.
        # Iterate over a copy because entries are deleted along the way.
        for symbol in list(symbols.values()):
            if symbol.replacement is None:
                # Symbols without a replacement specified are useless.
                log.warning(u"Replacement not defined in locale {locale} for symbol: {symbol}".format(
                    symbol=symbol.identifier, locale=self.locale))
                del symbols[symbol.identifier]
                try:
                    complexSymbolsList.remove(symbol)
                except ValueError:
                    pass
                continue
            if symbol.level is None:
                symbol.level = SYMLVL_ALL
            if symbol.preserve is None:
                symbol.preserve = SYMPRES_NEVER
            if symbol.displayName is None:
                symbol.displayName = symbol.identifier

        # Forget simple identifiers whose symbols were just dropped; otherwise the regexp
        # would still match them and the lookup in _regexpRepl would raise KeyError.
        simpleSymbolIdentifiers = [
            identifier for identifier in simpleSymbolIdentifiers if identifier in symbols]
        # Single-character symbols, used to detect runs of a repeated character.
        characters = "".join(
            identifier for identifier in simpleSymbolIdentifiers if len(identifier) == 1)
        # Longest identifiers first, so that longer symbols win over their own prefixes.
        simpleSymbolIdentifiers.sort(key=len, reverse=True)

        # Build the regexp; the order of alternatives decides which one matches.
        patterns = [
            # Strip space from the end of the text.
            r"(?P<rstripSpace> +$)",
        ]
        if characters:
            # A single-character symbol occurring four or more times in a row.
            # Skipped when there are no such symbols, as "[]" is not a valid character class.
            patterns.append(
                r"(?P<repeated>(?P<repTmp>[%s])(?P=repTmp){3,})" % re.escape(characters))
        # Complex symbols, each in a named group so the matching symbol can be identified.
        patterns.extend(
            u"(?P<c{index}>{pattern})".format(index=index, pattern=symbol.pattern)
            for index, symbol in enumerate(complexSymbolsList))
        if simpleSymbolIdentifiers:
            # Simple symbols are all handled by a single group.
            # Skipped when empty, as an empty group would match at every position.
            patterns.append(u"(?P<simple>{})".format(
                "|".join(re.escape(identifier) for identifier in simpleSymbolIdentifiers)
            ))
        pattern = "|".join(patterns)
        try:
            self._regexp = re.compile(pattern, re.UNICODE)
        except re.error as e:
            log.error("Invalid complex symbol regular expression in locale %s: %s" % (locale, e))
            raise LookupError

    def _regexpRepl(self, m):
        """Compute the replacement text for one regexp match.
        Reads C{self._level}, which L{processText} sets before substituting.
        @param m: The match object.
        @return: The text to substitute for the match.
        """
        group = m.lastgroup

        if group == "rstripSpace":
            return ""

        text = m.group()
        if group == "repeated":
            # Repeated character: speak the count followed by the symbol once.
            symbol = self.computedSymbols[text[0]]
            if self._level >= symbol.level:
                return u" {count} {char} ".format(count=len(text), char=symbol.replacement)
            return " "

        if group == "simple":
            # Simple symbol: the matched text is the identifier.
            symbol = self.computedSymbols[text]
        else:
            # Complex symbol: the group name is "c" followed by the index into the list.
            index = int(group[1:])
            symbol = self._computedComplexSymbolsList[index]
        if symbol.preserve == SYMPRES_ALWAYS or (symbol.preserve == SYMPRES_NOREP and self._level < symbol.level):
            # Keep the original text after any replacement.
            suffix = text
        else:
            suffix = " "
        if self._level >= symbol.level and symbol.replacement:
            return u" {repl}{suffix}".format(repl=symbol.replacement, suffix=suffix)
        return suffix

    def processText(self, text, level):
        """Process some text, converting symbols according to the given level.
        @param text: The text to process.
        @param level: The symbol level to use; one of the SYMLVL_* constants.
        @return: The processed text.
        """
        # Stored on the instance because the substitution callback only receives the match.
        self._level = level
        return self._regexp.sub(self._regexpRepl, text)

    def updateSymbol(self, newSymbol):
        """Update information for a symbol if it has changed.
        If there is a change, the changed information will be added to the user's symbol data.
        These changes do not take effect until the symbol processor is reinitialised.
        @param newSymbol: The symbol to update.
        @type newSymbol: L{SpeechSymbol}
        @return: Whether there was a change.
        @rtype: bool
        @raise KeyError: If there is no computed symbol with this identifier.
        """
        identifier = newSymbol.identifier
        oldSymbol = self.computedSymbols[identifier]
        if oldSymbol is newSymbol:
            return False
        try:
            userSymbol = self.userSymbols.symbols[identifier]
        except KeyError:
            userSymbol = SpeechSymbol(identifier)

        # Copy only the fields which differ from the computed values into the user's symbol.
        changed = False
        for attr in ("pattern", "replacement", "level", "preserve", "displayName"):
            newValue = getattr(newSymbol, attr)
            if newValue != getattr(oldSymbol, attr):
                setattr(userSymbol, attr, newValue)
                changed = True

        if not changed:
            return False

        # Do this in case the symbol wasn't in userSymbols before.
        self.userSymbols.symbols[identifier] = userSymbol
        return True
545
# Per-locale cache of SpeechSymbolProcessor objects, created on first request.
_localeSpeechSymbolProcessors = LocaleDataMap(SpeechSymbolProcessor)


def processSpeechSymbols(locale, text, level):
    """Process some text, converting symbols according to desired pronunciation.
    Falls back to English when no symbol processor can be created for the locale.
    @param locale: The locale of the text.
    @type locale: str
    @param text: The text to process.
    @type text: str
    @param level: The symbol level to use; one of the SYMLVL_* constants.
    @return: The processed text.
    @raise LookupError: If no symbol processor is available, not even for English.
    """
    try:
        ss = _localeSpeechSymbolProcessors.fetchLocaleData(locale)
    except LookupError:
        # Only fall back when the failed locale was not English already.
        # "en" must be excluded explicitly, otherwise this would recurse without end.
        if locale != "en" and not locale.startswith("en_"):
            return processSpeechSymbols("en", text, level)
        raise
    return ss.processText(text, level)
563
def processSpeechSymbol(locale, symbol):
    """Process a single symbol according to desired pronunciation.
    Falls back to English when no symbol processor can be created for the locale.
    @param locale: The locale of the symbol.
    @type locale: str
    @param symbol: The symbol.
    @type symbol: str
    @return: The replacement defined for the symbol, or the symbol itself if it is not known.
    @raise LookupError: If no symbol processor is available, not even for English.
    """
    try:
        ss = _localeSpeechSymbolProcessors.fetchLocaleData(locale)
    except LookupError:
        # Only fall back when the failed locale was not English already.
        # "en" must be excluded explicitly, otherwise this would recurse without end.
        if locale != "en" and not locale.startswith("en_"):
            return processSpeechSymbol("en", symbol)
        raise
    try:
        return ss.computedSymbols[symbol].replacement
    except KeyError:
        # Unknown symbol: leave it as it is.
        pass
    return symbol
582