shinmeikai.py (3361B)
# coding: utf8
"""Look up word definitions in the bundled Shinmeikai dictionary files."""
import os
import re

from . import jpParser
from aqt import mw
from aqt.utils import showInfo
from aqt.qt import *

# Data directory that sits next to this module.
source_directory = os.path.join(os.path.dirname(__file__), 'data')

# Common path prefix of the dictionary files; "<number>.txt" is appended.
dictFileNamePrefix = os.path.join(source_directory, "Shinmeikai", "shinmeikai_")

# Dictionary files are numbered 1..amountOfDictionaryFiles.
amountOfDictionaryFiles = 9


def getDefsOfWord(word, conf):
    """Return the Shinmeikai definitions found for ``word``.

    Thin wrapper around ``searchForWordInShinmeikai``; the arguments are
    passed through unchanged and its result is returned as-is.
    """
    return searchForWordInShinmeikai(word, conf)


def searchForWordInShinmeikai(searchTerm, conf):
    """Search every Shinmeikai dictionary file for ``searchTerm``.

    Args:
        searchTerm: The word to look up. It is matched literally (regex
            metacharacters are escaped). If it contains kanji it is matched
            against the first quoted field of an entry, otherwise against
            the second quoted field.
        conf: Unused here; kept so existing callers keep working.

    Returns:
        A list of ``(definition, frequency)`` tuples, one per usable match,
        or the empty string ``""`` when nothing was found (callers may rely
        on that sentinel, so it is preserved).

    Raises:
        OSError: If one of the dictionary files cannot be opened.
    """
    # Escape the term so characters such as "(", "?" or "[" in a word cannot
    # break or alter the pattern.
    term = re.escape(searchTerm)
    if jpParser.stringContainsKanji(searchTerm):
        # ["<term>", followed by seven more comma-separated fields, then "]".
        regex = r'(\["' + term + r'"(,.*?){7}\])'
    else:
        # ["<up to 8 chars>","<term>", followed by six more fields, then "]".
        regex = r'(\[".{0,8}","' + term + r'"(,.*?){6}\])'

    # Compiled once; the pattern is the same for every dictionary file.
    pattern = re.compile(regex, re.UNICODE)

    defArray = []
    for dictionaryCounter in range(1, amountOfDictionaryFiles + 1):
        path = dictFileNamePrefix + str(dictionaryCounter) + ".txt"
        # Binary read + explicit decode keeps the file's bytes untouched
        # (no newline translation); "with" closes the handle even on error.
        with open(path, "rb") as dictFile:
            contents = dictFile.read().decode("UTF-8")

        for match in pattern.findall(contents):
            # The pattern has two groups, so findall yields a tuple per
            # match. extractDefFromElement parses the str() of that tuple
            # and undoes the backslash doubling that repr introduces.
            # NOTE(review): repr may escape other characters too; confirm
            # whether passing match[0] directly would be preferable.
            innerDef = extractDefFromElement(str(match))
            if innerDef is None:
                # No definition could be extracted; skip the entry instead
                # of failing with a TypeError in the helpers below.
                continue

            hira = extractHiraFromElement(innerDef)
            kanji = extractKanjiFromElement(innerDef)
            if kanji is None:
                # Word has no kanji: use the reading for both fields.
                kanji = hira
            freq = jpParser.getWordFreq(hira, kanji)

            defArray.append((innerDef, int(freq)))

    if not defArray:
        return ""
    return defArray


def extractDefFromElement(org):
    """Extract the definition text from a stringified dictionary match.

    Returns the text between the last ``["`` ... ``"]`` pair that the
    pattern can find on the first line of ``org``, with every doubled
    backslash collapsed to a single one, or ``None`` if there is no match.
    """
    matchObj = re.match(r'(.*)\[.*\["(.*?)"\]', org)
    if matchObj is None:
        return None
    # Two consecutive backslashes -> one.
    return matchObj.group(2).replace("\\\\", "\\")


def extractHiraFromElement(org):
    """Return the text preceding the first space at the start of ``org``.

    Presumably this is the kana reading that opens a definition (inferred
    from the name). Returns ``None`` when the first line has no space.
    """
    matchObj = re.match("(.*?) ", org)
    return None if matchObj is None else matchObj.group(1)


def extractKanjiFromElement(org):
    """Return the kanji spelling(s) enclosed in 【...】 on the first line.

    Returns:
        ``None`` if there is no 【...】 group; the enclosed string if it
        has no "・" separator; otherwise a two-element list holding the text
        before the first "・" and the text after it (up to the first space).
        Only two alternatives are handled, e.g. 内・家 for うち; any further
        "・" stays inside the second element.
    """
    matchObj = re.match(".*?【(.*?)】", org)
    if matchObj is None:
        return None
    returnValue = matchObj.group(1)

    if "・" not in returnValue:
        return returnValue

    # Multiple kanji found: split on the first separator only.
    first, _, rest = returnValue.partition("・")
    second = rest.split(" ", 1)[0]
    return [first, second]