#!/usr/bin/python3
# -*- coding: utf-8 -*-
# vim:et sts=4 sw=4
#
# ibus-table - The Tables engine for IBus
#
# Copyright (c) 2008-2009 Yu Yuwei <acevery@gmail.com>
# Copyright (c) 2009-2014 Caius "kaio" CHANCE <me@kaio.net>
# Copyright (c) 2012-2015 Mike FABIAN <mfabian@redhat.com>
#
# This library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
#
import os
import sys
import bz2
import re
from optparse import OptionParser
sys.path.append(os.path.dirname(os.path.abspath(__file__)))
import tabsqlitedb
# Characters which must not occur in a GConf key name:
_INVALID_KEYNAME_CHARS = " \t\r\n\"$&<>,+=#!()'|{}[]?~`;%\\"
def gconf_valid_keyname(keyname):
    """
    Check whether “keyname” is usable as a GConf key name.

    Keynames must be ascii, and must not contain any invalid characters

    >>> gconf_valid_keyname('nyannyan')
    True
    >>> gconf_valid_keyname('nyan nyan')
    False
    >>> gconf_valid_keyname('nyannyan[')
    False
    >>> gconf_valid_keyname('nyan\tnyan')
    False
    """
    for char in keyname:
        if ord(char) > 127 or char in _INVALID_KEYNAME_CHARS:
            return False
    return True
class InvalidTableName(Exception):
    """
    Raised when the NAME attribute of a table is not a valid key name.

    The offending name is kept in the “table_name” attribute.
    """
    def __init__(self, name):
        super(InvalidTableName, self).__init__()
        # Remember the invalid name so __str__ can report it.
        self.table_name = name

    def __str__(self):
        return (
            'Value of NAME attribute (%s) cannot contain '
            'any of %r and must be all ascii'
            % (self.table_name, _INVALID_KEYNAME_CHARS))
# we use OptionParser to parse the cmd arguments :)
# NOTE(review): optparse has been deprecated in favour of argparse since
# Python 3.2; migrating would change the "%default" placeholders to
# "%(default)s" — left as is to preserve behaviour.
_OPTION_PARSER = OptionParser(usage="usage: %prog [options]")
_OPTION_PARSER.add_option(
    '-n', '--name',
    action='store',
    dest='name',
    default='',
    help=(
        'specifies the file name for the binary database for the IME. '
        + 'The default is "%default". If the file name of the database '
        + 'is not specified, the file name of the source file before '
        + 'the first "." will be appended with ".db" and that will be '
        + 'used as the file name of the database.'))
_OPTION_PARSER.add_option(
    '-s', '--source',
    action='store',
    dest='source',
    default='',
    help=(
        'specifies the file which contains the source of the IME. '
        + 'The default is "%default".'))
_OPTION_PARSER.add_option(
    '-e', '--extra',
    action='store',
    dest='extra',
    default='',
    help=(
        'specifies the file name for the extra words for the IME. '
        + 'The default is "%default".'))
_OPTION_PARSER.add_option(
    '-p', '--pinyin',
    action='store',
    dest='pinyin',
    default='/usr/share/ibus-table/data/pinyin_table.txt.bz2',
    help=(
        'specifies the source file for the pinyin. '
        + 'The default is "%default".'))
_OPTION_PARSER.add_option(
    '-o', '--no-create-index',
    action='store_false',
    dest='index',
    default=True,
    help=(
        'Do not create an index for a database '
        + '(Only for distrubution purposes, '
        + 'a normal user should not use this flag!)'))
_OPTION_PARSER.add_option(
    '-i', '--create-index-only',
    action='store_true',
    dest='only_index',
    default=False,
    help=(
        'Only create an index for an existing database. '
        + 'Specifying the file name of the binary database '
        + 'with the -n or --name option is required '
        + 'when this option is used.'))
_OPTION_PARSER.add_option(
    '-d', '--debug',
    action='store_true',
    dest='debug',
    default=False,
    help='Print extra debug messages.')
# Parse the command line at import time; the results are module-level
# constants used by main() below.
(_OPTIONS, _ARGS) = _OPTION_PARSER.parse_args()
# --create-index-only requires an existing database file given with --name:
if _OPTIONS.only_index:
    if not _OPTIONS.name:
        _OPTION_PARSER.print_help()
        print(
            '\nPlease specify the file name of the database '
            + 'you want to create an index on!')
        sys.exit(2)
    # (os.path.isfile() already implies existence, the extra
    # os.path.exists() check is redundant but harmless.)
    if not os.path.exists(_OPTIONS.name) or not os.path.isfile(_OPTIONS.name):
        _OPTION_PARSER.print_help()
        print("\nThe database file '%s' does not exist." % _OPTIONS.name)
        sys.exit(2)
# Derive a default database name from the source file name when
# --name was not given (see the --name help text above):
if not _OPTIONS.name and _OPTIONS.source:
    _OPTIONS.name = os.path.basename(_OPTIONS.source).split('.')[0] + '.db'
# Without --name and --source there is nothing we can do:
if not _OPTIONS.name:
    _OPTION_PARSER.print_help()
    print(
        '\nYou need to specify the file which '
        + 'contains the source of the IME!')
    sys.exit(2)
def main():
    """Create the binary SQLite database for a table based IME.

    Depending on the command line options this either builds a new
    database from the table source file — adding attributes, phrases,
    goucima, pinyin, and extra words as configured — or only creates
    the indexes on an already existing database (--create-index-only).

    :return: 0 in the --create-index-only code path, None otherwise.
    """
    def debug_print(message):
        '''Print “message” only when the --debug option was given.'''
        if _OPTIONS.debug:
            print(message)

    if not _OPTIONS.only_index:
        try:
            # Remove a stale database file before recreating it.
            os.unlink(_OPTIONS.name)
        except OSError:
            # A missing file is fine; narrowed from a bare “except:” so
            # that unrelated errors are no longer silently swallowed.
            pass

    debug_print('Processing Database')
    db = tabsqlitedb.TabSqliteDb(filename=_OPTIONS.name,
                                 user_db=None,
                                 create_database=True)

    def parse_source(f):
        '''Split the table source lines into categories.

        :param f: Iterable of unicode lines of the table source.
        :return: A tuple (attributes, table, goucima) where each element
                 is a list of the source lines of that category.
        '''
        _attri = []
        _table = []
        _gouci = []
        patt_com = re.compile(r'^###.*')
        patt_blank = re.compile(r'^[ \t]*$')
        patt_conf = re.compile(r'[^\t]*=[^\t]*')
        patt_table = re.compile(r'([^\t]+)\t([^\t]+)\t([0-9]+)(\t.*)?$')
        patt_gouci = re.compile(r' *[^\s]+ *\t *[^\s]+ *$')
        for line in f:
            if (not patt_com.match(line)) and (not patt_blank.match(line)):
                # The first matching pattern decides the category:
                for _patt, _list in (
                        (patt_table, _table),
                        (patt_gouci, _gouci),
                        (patt_conf, _attri)):
                    if _patt.match(line):
                        _list.append(line)
                        break
        if not _gouci:
            # The user didn’t provide goucima (goucima = 構詞碼 =
            # “word formation keys”) in the table source, so we use
            # the longest encoding for a single character as the
            # goucima for that character.
            #
            # Example:
            #
            # wubi-jidian86.txt contains:
            #
            # a	工	99454797
            # aaa	工	551000000
            # aaaa	工	551000000
            # aaad	工期	5350000
            # ... and more matches for compounds containing 工
            #
            # The longest key sequence to type 工 as a single
            # character is “aaaa”. Therefore, the goucima of 工 is
            # “aaaa” (There is one other character with the same goucima
            # in wubi-jidian86.txt, 㠭 also has the goucima “aaaa”).
            gouci_dict = {}
            for line in _table:
                res = patt_table.match(line)
                if res and len(res.group(2)) == 1:
                    zi = res.group(2)
                    tabkeys = res.group(1)
                    # Keep the longest key sequence seen so far.
                    if (zi not in gouci_dict
                            or len(tabkeys) > len(gouci_dict[zi])):
                        gouci_dict[zi] = tabkeys
            for key in gouci_dict:
                _gouci.append('%s\t%s' % (key, gouci_dict[key]))
            _gouci.sort()
        return (_attri, _table, _gouci)

    def parse_pinyin(f):
        '''Parse the pinyin source into "hanzi\\tpinyin\\tfreq" lines.

        :param f: Iterable of lines (bytes or str) of the pinyin source.
        :return: A list of unicode lines, one per (hanzi, yin) pair.
        '''
        _pinyins = []
        patt_com = re.compile(r'^#.*')
        patt_blank = re.compile(r'^[ \t]*$')
        patt_py = re.compile(r'(.*)\t(.*)\t(.*)')
        patt_yin = re.compile(r'[a-z]+[1-5]')
        for line in f:
            if not isinstance(line, str):
                # bz2 files are opened in binary mode and yield bytes.
                line = line.decode('utf-8')
            if (not patt_com.match(line)) and (not patt_blank.match(line)):
                res = patt_py.match(line)
                if res:
                    # One source line may list several readings:
                    yins = patt_yin.findall(res.group(2))
                    for yin in yins:
                        _pinyins.append("%s\t%s\t%s"
                                        % (res.group(1), yin, res.group(3)))
        return _pinyins[:]

    def parse_extra(f):
        '''Return the lines of the extra words file which look valid.'''
        _extra = []
        patt_com = re.compile(r'^###.*')
        patt_blank = re.compile(r'^[ \t]*$')
        patt_extra = re.compile(r'(.*)\t(.*)')
        for line in f:
            if not isinstance(line, str):
                line = line.decode('utf-8')
            if (not patt_com.match(line)) and (not patt_blank.match(line)):
                if patt_extra.match(line):
                    _extra.append(line)
        return _extra

    def pinyin_parser(f):
        '''Yield (pinyin, hanzi, freq) tuples from parse_pinyin() lines.'''
        for pinyin_line in f:
            if not isinstance(pinyin_line, str):
                pinyin_line = pinyin_line.decode('utf-8')
            _zi, _pinyin, _freq = pinyin_line.strip().split()
            yield (_pinyin, _zi, _freq)

    def phrase_parser(f):
        '''Return (tabkeys, phrase, freq, 0) tuples from table lines.'''
        phrase_list = []
        for line in f:
            if not isinstance(line, str):
                line = line.decode('utf-8')
            xingma, phrase, freq = line.split('\t')[:3]
            if phrase == 'NOSYMBOL':
                # “NOSYMBOL” is a placeholder for an empty phrase.
                phrase = u''
            phrase_list.append((xingma, phrase, int(freq), 0))
        return phrase_list

    def goucima_parser(f):
        '''Yield (character, goucima) tuples from "zi\\tgoucima" lines.'''
        for line in f:
            if not isinstance(line, str):
                line = line.decode('utf-8')
            zi, gcm = line.strip().split()
            yield (zi, gcm)

    def attribute_parser(f):
        '''Yield (attribute, value) pairs from "attribute = value" lines.'''
        for line in f:
            if not isinstance(line, str):
                line = line.decode('utf-8')
            try:
                attr, val = line.strip().split('=')
            except ValueError:
                # Lines like “foo == bar” split into three parts on “=”,
                # retry splitting on “==”.
                attr, val = line.strip().split('==')
            attr = attr.strip().lower()
            val = val.strip()
            yield (attr, val)

    def extra_parser(f):
        '''Return (tabkeys, phrase, freq, 0) tuples for the extra words.

        Phrases for which db.parse_phrase() finds no tabkeys are skipped
        with a warning.
        '''
        extra_list = []
        for line in f:
            if not isinstance(line, str):
                line = line.decode('utf-8')
            phrase, freq = line.strip().split()
            _tabkey = db.parse_phrase(phrase)
            if _tabkey:
                extra_list.append((_tabkey, phrase, freq, 0))
            else:
                print('No tabkeys found for “%s”, not adding.\n' % phrase)
        return extra_list

    def get_char_prompts(f):
        '''
        Returns something like
        ("char_prompts", "{'a': '日', 'b': '日', 'c': '金', ...}")
        i.e. the attribute name "char_prompts" and as its value
        the string representation of a Python dictionary.
        '''
        char_prompts = {}
        start = False
        for line in f:
            if not isinstance(line, str):
                line = line.decode('utf-8')
            if re.match(r'^BEGIN_CHAR_PROMPTS_DEFINITION', line):
                start = True
                continue
            if not start:
                continue
            if re.match(r'^END_CHAR_PROMPTS_DEFINITION', line):
                break
            match = re.search(
                r'^(?P<char>[^\s]+)[\s]+(?P<prompt>[^\s]+)', line)
            if match:
                char_prompts[match.group('char')] = match.group('prompt')
        return ("char_prompts", repr(char_prompts))

    if _OPTIONS.only_index:
        debug_print('Only create Indexes')
        debug_print('Optimizing database ')
        db.optimize_database()
        debug_print('Create Indexes ')
        db.create_indexes('main')
        debug_print('Done! :D')
        return 0

    # now we parse the ime source file
    debug_print('\tLoad sources "%s"' % _OPTIONS.source)
    patt_s = re.compile(r'.*\.bz2')
    _bz2s = patt_s.match(_OPTIONS.source)
    if _bz2s:
        # BZ2File yields bytes; decode to unicode here, otherwise the
        # str.replace() below would raise TypeError on a bytes object.
        with bz2.BZ2File(_OPTIONS.source, 'r') as file_handle:
            source = file_handle.read().decode('UTF-8')
    else:
        with open(_OPTIONS.source, mode='r', encoding='UTF-8') as file_handle:
            source = file_handle.read()
    source = source.replace('\r\n', '\n')
    source = source.split('\n')
    # first get config line and table line and goucima line respectively
    debug_print('\tParsing table source file ')
    attri, table, gouci = parse_source(source)
    debug_print('\t get attribute of IME :)')
    attributes = list(attribute_parser(attri))
    attributes.append(get_char_prompts(source))
    debug_print('\t add attributes into DB ')
    db.update_ime(attributes)
    db.create_tables('main')
    # second, we use generators for database generating:
    debug_print('\t get phrases of IME :)')
    phrases = phrase_parser(table)
    # now we add things into db
    debug_print('\t add phrases into DB ')
    db.add_phrases(phrases)
    # ime_properties.get() may return None when the attribute is
    # missing from the source; treat a missing attribute as 'false'
    # instead of crashing with AttributeError on .lower().
    user_can_define_phrase = (
        db.ime_properties.get('user_can_define_phrase') or u'')
    if user_can_define_phrase.lower() == u'true':
        debug_print('\t get goucima of IME :)')
        goucima = goucima_parser(gouci)
        debug_print('\t add goucima into DB ')
        db.add_goucima(goucima)
    pinyin_mode = db.ime_properties.get('pinyin_mode') or u''
    if pinyin_mode.lower() == u'true':
        debug_print('\tLoad pinyin source "%s"' % _OPTIONS.pinyin)
        _bz2p = patt_s.match(_OPTIONS.pinyin)
        if _bz2p:
            pinyin_s = bz2.BZ2File(_OPTIONS.pinyin, "r")
        else:
            pinyin_s = open(_OPTIONS.pinyin, 'r')
        # Close the file handle deterministically (was leaked before).
        with pinyin_s:
            debug_print('\tParsing pinyin source file ')
            pyline = parse_pinyin(pinyin_s)
        debug_print('\tPreparing pinyin entries')
        pinyin = pinyin_parser(pyline)
        debug_print('\t add pinyin into DB ')
        db.add_pinyin(pinyin)
    debug_print('Optimizing database ')
    db.optimize_database()
    if user_can_define_phrase.lower() == u'true' and _OPTIONS.extra:
        debug_print('\tPreparing for adding extra words')
        db.create_indexes('main')
        debug_print('\tLoad extra words source "%s"' % _OPTIONS.extra)
        _bz2p = patt_s.match(_OPTIONS.extra)
        if _bz2p:
            extra_s = bz2.BZ2File(_OPTIONS.extra, 'r')
        else:
            extra_s = open(_OPTIONS.extra, 'r')
        with extra_s:
            debug_print('\tParsing extra words source file ')
            extraline = parse_extra(extra_s)
        debug_print('\tPreparing extra words lines')
        extrawords = extra_parser(extraline)
        debug_print('\t we have %d extra phrases from source'
                    % len(extrawords))
        # first get the entry of original phrases from
        # phrases-[(xingma, phrase, int(freq), 0)]
        orig_phrases = {}
        for phrase in phrases:
            orig_phrases.update({"%s\t%s" % (phrase[0], phrase[1]): phrase})
        debug_print('\t the len of orig_phrases is: %d' % len(orig_phrases))
        extra_phrases = {}
        for extraword in extrawords:
            extra_phrases.update(
                {"%s\t%s" % (extraword[0], extraword[1]): extraword})
        debug_print('\t the len of extra_phrases is: %d' % len(extra_phrases))
        # pop duplicated keys — iterate over a copy of the keys, because
        # popping from a dict while iterating it directly raises
        # RuntimeError ("dictionary changed size during iteration").
        for phrase in list(extra_phrases):
            if phrase in orig_phrases:
                extra_phrases.pop(phrase)
        debug_print('\t %d extra phrases will be added' % len(extra_phrases))
        new_phrases = list(extra_phrases.values())
        debug_print('\tAdding extra words into DB ')
        db.add_phrases(new_phrases)
        debug_print('Optimizing database ')
        db.optimize_database()
    if _OPTIONS.index:
        debug_print('Create Indexes ')
        db.create_indexes('main')
    else:
        debug_print(
            "We don't create an index on the database, "
            + "you should only activate this function "
            + "for distribution purposes.")
        db.drop_indexes('main')
    debug_print('Done! :D')
# Build the database only when this file is executed as a script.
if __name__ == "__main__":
    main()