Skip to content

Commit

Permalink
feat(dictionary): packs extends the dictionary with extra binary tabl…
Browse files Browse the repository at this point in the history
…e files
  • Loading branch information
kionz committed Jul 26, 2020
1 parent c83b246 commit 930074c
Show file tree
Hide file tree
Showing 12 changed files with 303 additions and 125 deletions.
3 changes: 3 additions & 0 deletions src/rime/algo/utilities.cc
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,9 @@ int CompareVersionString(const string& x, const string& y) {
return 0;
}

// Constructs a checksum computer whose crc_ member is initialized with
// |initial_remainder|.
ChecksumComputer::ChecksumComputer(uint32_t initial_remainder)
: crc_(initial_remainder) {}

void ChecksumComputer::ProcessFile(const string& file_name) {
std::ifstream fin(file_name.c_str());
string file_content((std::istreambuf_iterator<char>(fin)),
Expand Down
1 change: 1 addition & 0 deletions src/rime/algo/utilities.h
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ int CompareVersionString(const string& x,

class ChecksumComputer {
public:
explicit ChecksumComputer(uint32_t initial_remainder = 0);
void ProcessFile(const string& file_name);
uint32_t Checksum();

Expand Down
218 changes: 149 additions & 69 deletions src/rime/dict/dict_compiler.cc
Original file line number Diff line number Diff line change
Expand Up @@ -26,85 +26,103 @@ namespace rime {

DictCompiler::DictCompiler(Dictionary *dictionary, const string& prefix)
: dict_name_(dictionary->name()),
packs_(dictionary->packs()),
prism_(dictionary->prism()),
table_(dictionary->table()),
tables_(dictionary->tables()),
prefix_(prefix) {
}

static string LocateFile(const string& file_name) {
// Resolves |file_name| to a path string, using a resource resolver that the
// Service singleton creates for the descriptor {"build_source", "", ""}.
static string locate_file(const string& file_name) {
  auto& service = Service::instance();
  the<ResourceResolver> source_resolver(
      service.CreateResourceResolver({"build_source", "", ""}));
  const auto resolved_path = source_resolver->ResolvePath(file_name);
  return resolved_path.string();
}

// Opens |dict_file| and loads the dictionary header from it into |settings|.
// Returns the result reported by DictSettings::LoadDictHeader.
static bool load_dict_settings_from_file(DictSettings* settings,
                                         const string& dict_file) {
  std::ifstream header_stream(dict_file.c_str());
  // The stream is closed by its destructor once the result has been computed.
  return settings->LoadDictHeader(header_stream);
}

// Collects the source files of every table listed by |settings|.
//
// For each entry returned by settings.GetTables(), the file
// "<name>.dict.yaml" is located with locate_file() and appended to
// |dict_files|. When GetTables() returns null nothing is appended and the
// call succeeds.
//
// Returns false, after logging an error, as soon as one located file does not
// exist; files appended before the failure remain in |dict_files|.
static bool get_dict_files_from_settings(vector<string>* dict_files,
                                         DictSettings& settings) {
  auto tables = settings.GetTables();
  if (!tables) {
    return true;
  }
  for (auto it = tables->begin(); it != tables->end(); ++it) {
    // Skip entries that are not scalar values: As<ConfigValue>() on such an
    // entry presumably yields null, and calling str() on it would crash.
    if (!Is<ConfigValue>(*it))
      continue;
    string dict_name = As<ConfigValue>(*it)->str();
    string dict_file = locate_file(dict_name + ".dict.yaml");
    if (!boost::filesystem::exists(dict_file)) {
      LOG(ERROR) << "source file '" << dict_file << "' does not exist.";
      return false;
    }
    dict_files->push_back(dict_file);
  }
  return true;
}

static uint32_t compute_dict_file_checksum(uint32_t initial_checksum,
const vector<string>& dict_files,
DictSettings& settings) {
if (dict_files.empty()) {
return initial_checksum;
}
ChecksumComputer cc(initial_checksum);
for (const auto& file_name : dict_files) {
cc.ProcessFile(file_name);
}
if (settings.use_preset_vocabulary()) {
cc.ProcessFile(PresetVocabulary::DictFilePath(settings.vocabulary()));
}
return cc.Checksum();
}

bool DictCompiler::Compile(const string &schema_file) {
LOG(INFO) << "compiling dictionary for " << schema_file;
bool build_table_from_source = true;
DictSettings settings;
string dict_file = LocateFile(dict_name_ + ".dict.yaml");
string dict_file = locate_file(dict_name_ + ".dict.yaml");
if (!boost::filesystem::exists(dict_file)) {
LOG(ERROR) << "source file '" << dict_file << "' does not exist.";
build_table_from_source = false;
}
else {
std::ifstream fin(dict_file.c_str());
if (!settings.LoadDictHeader(fin)) {
LOG(ERROR) << "failed to load settings from '" << dict_file << "'.";
return false;
}
fin.close();
LOG(INFO) << "dict name: " << settings.dict_name();
LOG(INFO) << "dict version: " << settings.dict_version();
else if (!load_dict_settings_from_file(&settings, dict_file)) {
LOG(ERROR) << "failed to load settings from '" << dict_file << "'.";
return false;
}
vector<string> dict_files;
auto tables = settings.GetTables();
for(auto it = tables->begin(); it != tables->end(); ++it) {
if (!Is<ConfigValue>(*it))
continue;
string dict_name = As<ConfigValue>(*it)->str();
string dict_file = LocateFile(dict_name + ".dict.yaml");
if (!boost::filesystem::exists(dict_file)) {
LOG(ERROR) << "source file '" << dict_file << "' does not exist.";
return false;
}
dict_files.push_back(dict_file);
}
uint32_t dict_file_checksum = 0;
if (!dict_files.empty()) {
ChecksumComputer cc;
for (const auto& file_name : dict_files) {
cc.ProcessFile(file_name);
}
if (settings.use_preset_vocabulary()) {
cc.ProcessFile(PresetVocabulary::DictFilePath(settings.vocabulary()));
}
dict_file_checksum = cc.Checksum();
if (!get_dict_files_from_settings(&dict_files, settings)) {
return false;
}
uint32_t dict_file_checksum =
compute_dict_file_checksum(0, dict_files, settings);
uint32_t schema_file_checksum =
schema_file.empty() ? 0 : Checksum(schema_file);
bool rebuild_table = true;
bool rebuild_prism = true;
if (table_->Exists() && table_->Load()) {
if (!build_table_from_source) {
dict_file_checksum = table_->dict_file_checksum();
LOG(INFO) << "reuse existing table: " << table_->file_name();
}
if (table_->dict_file_checksum() == dict_file_checksum) {
rebuild_table = false;
bool rebuild_table = false;
bool rebuild_prism = false;
const auto& primary_table = tables_[0];
if (primary_table->Exists() && primary_table->Load()) {
if (build_table_from_source) {
rebuild_table = primary_table->dict_file_checksum() != dict_file_checksum;
} else {
dict_file_checksum = primary_table->dict_file_checksum();
LOG(INFO) << "reuse existing table: " << primary_table->file_name();
}
table_->Close();
}
else if (!build_table_from_source) {
primary_table->Close();
} else if (build_table_from_source) {
rebuild_table = true;
} else {
LOG(ERROR) << "neither " << dict_name_ << ".dict.yaml nor "
<< dict_name_ << ".table.bin exists.";
return false;
}
if (prism_->Exists() && prism_->Load()) {
if (prism_->dict_file_checksum() == dict_file_checksum &&
prism_->schema_file_checksum() == schema_file_checksum) {
rebuild_prism = false;
}
rebuild_prism = prism_->dict_file_checksum() != dict_file_checksum ||
prism_->schema_file_checksum() != schema_file_checksum;
prism_->Close();
} else {
rebuild_prism = true;
}
LOG(INFO) << dict_file << "[" << dict_files.size() << " file(s)]"
<< " (" << dict_file_checksum << ")";
Expand All @@ -126,11 +144,55 @@ bool DictCompiler::Compile(const string &schema_file) {
if (options_ & kRebuildPrism) {
rebuild_prism = true;
}
if (rebuild_table && !BuildTable(&settings, dict_files, dict_file_checksum))
return false;
if (rebuild_prism && !BuildPrism(schema_file,
dict_file_checksum, schema_file_checksum))
Syllabary syllabary;
if (rebuild_table) {
EntryCollector collector;
if (!BuildTable(0,
collector,
&settings,
dict_files,
dict_file_checksum)) {
return false;
}
syllabary = std::move(collector.syllabary);
}
if (rebuild_prism &&
!BuildPrism(schema_file,
syllabary,
dict_file_checksum,
schema_file_checksum)) {
return false;
}
if (rebuild_table) {
for (int table_index = 1; table_index < tables_.size(); ++table_index) {
const auto& pack_name = packs_[table_index - 1];
EntryCollector collector(std::move(syllabary));
DictSettings settings;
string dict_file = locate_file(pack_name + ".dict.yaml");
if (!boost::filesystem::exists(dict_file)) {
LOG(ERROR) << "source file '" << dict_file << "' does not exist.";
continue;
}
if (!load_dict_settings_from_file(&settings, dict_file)) {
LOG(ERROR) << "failed to load settings from '" << dict_file << "'.";
continue;
}
vector<string> dict_files;
if (!get_dict_files_from_settings(&dict_files, settings)) {
continue;
}
uint32_t pack_file_checksum =
compute_dict_file_checksum(dict_file_checksum, dict_files, settings);
if (!BuildTable(table_index,
collector,
&settings,
dict_files,
pack_file_checksum)) {
LOG(ERROR) << "failed to build pack: " << pack_name;
}
syllabary = std::move(collector.syllabary);
}
}
// done!
return true;
}
Expand All @@ -143,17 +205,20 @@ static string RelocateToUserDirectory(const string& prefix,
return resolver.ResolvePath(resource_id).string();
}

bool DictCompiler::BuildTable(DictSettings* settings,
bool DictCompiler::BuildTable(int table_index,
EntryCollector& collector,
DictSettings* settings,
const vector<string>& dict_files,
uint32_t dict_file_checksum) {
LOG(INFO) << "building table...";
table_ = New<Table>(RelocateToUserDirectory(prefix_, table_->file_name()));
auto& table = tables_[table_index];
auto path = RelocateToUserDirectory(prefix_, table->file_name());
LOG(INFO) << "building table: " << path;
table = New<Table>(path);

EntryCollector collector;
collector.Configure(settings);
collector.Collect(dict_files);
if (options_ & kDump) {
boost::filesystem::path path(table_->file_name());
boost::filesystem::path path(table->file_name());
path.replace_extension(".txt");
collector.Dump(path.string());
}
Expand Down Expand Up @@ -184,16 +249,34 @@ bool DictCompiler::BuildTable(DictSettings* settings,
if (settings->sort_order() != "original") {
vocabulary.SortHomophones();
}
table_->Remove();
if (!table_->Build(collector.syllabary, vocabulary, collector.num_entries,
dict_file_checksum) ||
!table_->Save()) {
table->Remove();
if (!table->Build(collector.syllabary,
vocabulary,
collector.num_entries,
dict_file_checksum) ||
!table->Save()) {
return false;
}
}
// build reverse db for the primary table
if (table_index == 0 &&
!BuildReverseDb(settings,
collector,
vocabulary,
dict_file_checksum)) {
return false;
}
return true;
}

bool DictCompiler::BuildReverseDb(DictSettings* settings,
const EntryCollector& collector,
const Vocabulary& vocabulary,
uint32_t dict_file_checksum) {
// build .reverse.bin
ReverseDb reverse_db(RelocateToUserDirectory(prefix_,
dict_name_ + ".reverse.bin"));
auto path = RelocateToUserDirectory(prefix_,
dict_name_ + ".reverse.bin");
ReverseDb reverse_db(path);
if (!reverse_db.Build(settings,
collector.syllabary,
vocabulary,
Expand All @@ -206,15 +289,12 @@ bool DictCompiler::BuildTable(DictSettings* settings,
}

bool DictCompiler::BuildPrism(const string &schema_file,
const Syllabary& syllabary,
uint32_t dict_file_checksum,
uint32_t schema_file_checksum) {
LOG(INFO) << "building prism...";
prism_ = New<Prism>(RelocateToUserDirectory(prefix_, prism_->file_name()));

// get syllabary from table
Syllabary syllabary;
if (!table_->Load() || !table_->GetSyllabary(&syllabary) || syllabary.empty())
return false;
// apply spelling algebra and prepare corrections (if enabled)
Script script;
if (!schema_file.empty()) {
Expand Down
17 changes: 13 additions & 4 deletions src/rime/dict/dict_compiler.h
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,8 @@ class Table;
class ReverseDb;
class DictSettings;
class EditDistanceCorrector;
class EntryCollector;
class Vocabulary;

class DictCompiler {
public:
Expand All @@ -34,18 +36,25 @@ class DictCompiler {
void set_options(int options) { options_ = options; }

private:
bool BuildTable(DictSettings* settings,
bool BuildTable(int table_index,
EntryCollector& collector,
DictSettings* settings,
const vector<string>& dict_files,
uint32_t dict_file_checksum);
bool BuildPrism(const string& schema_file,
const Syllabary& syllabary,
uint32_t dict_file_checksum,
uint32_t schema_file_checksum);
bool BuildReverseLookupDict(ReverseDb* db, uint32_t dict_file_checksum);
bool BuildReverseDb(DictSettings* settings,
const EntryCollector& collector,
const Vocabulary& vocabulary,
uint32_t dict_file_checksum);

string dict_name_;
const string& dict_name_;
const vector<string>& packs_;
an<Prism> prism_;
an<EditDistanceCorrector> correction_;
an<Table> table_;
vector<of<Table>> tables_;
int options_ = 0;
string prefix_;
};
Expand Down
6 changes: 6 additions & 0 deletions src/rime/dict/dict_settings.cc
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,10 @@ bool DictSettings::LoadDictHeader(std::istream& stream) {
return true;
}

// Reports whether the "name" entry of these settings is null.
bool DictSettings::empty() {
  const bool name_is_null = (*this)["name"].IsNull();
  return name_is_null;
}

// Returns the "name" entry of these settings converted to a string.
string DictSettings::dict_name() {
return (*this)["name"].ToString();
}
Expand Down Expand Up @@ -74,6 +78,8 @@ double DictSettings::min_phrase_weight() {
}

an<ConfigList> DictSettings::GetTables() {
if (empty())
return nullptr;
auto tables = New<ConfigList>();
tables->Append((*this)["name"]);
auto imports = (*this)["import_tables"].AsList();
Expand Down
1 change: 1 addition & 0 deletions src/rime/dict/dict_settings.h
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ class DictSettings : public Config {
public:
DictSettings();
bool LoadDictHeader(std::istream& stream);
bool empty();
string dict_name();
string dict_version();
string sort_order();
Expand Down
Loading

0 comments on commit 930074c

Please sign in to comment.