This source file includes the following definitions.
- SplitDicLine
- PopulateWordSet
#include "chrome/tools/convert_dict/dic_reader.h"
#include <algorithm>
#include <set>
#include "base/file_util.h"
#include "base/strings/string_util.h"
#include "chrome/tools/convert_dict/aff_reader.h"
#include "chrome/tools/convert_dict/hunspell_reader.h"
namespace convert_dict {
namespace {
// Maps a word to the set of affix indices that have been recorded for it.
// Being a std::map, iteration visits the words in ascending key order.
typedef std::map<std::string, std::set<int> > WordSet;
// Splits |line| on its first unescaped slash and stores the pieces in
// |output|, replacing whatever |output| held before.
//
// A slash counts as the separator only when it is not the first character of
// the line and is not directly preceded by a backslash.
//
// On return |output| holds one or two elements:
//   [0] the text before the separator (the whole line if there is none), with
//       every "\/" sequence replaced by a plain "/";
//   [1] the text after the separator, copied verbatim. It is only present when
//       at least one character follows the separator.
void SplitDicLine(const std::string& line, std::vector<std::string>* output) {
  // |line.size()| doubles as the "no separator found" marker.
  size_t slash_index = line.size();
  // Index 0 can never be a separator, so the scan starts at 1. This also keeps
  // the |line[i - 1]| lookup in range.
  for (size_t i = 1; i < line.size(); i++) {
    if (line[i] == '/' && line[i - 1] != '\\') {
      slash_index = i;
      break;
    }
  }

  output->clear();

  std::string word = line.substr(0, slash_index);
  ReplaceSubstringsAfterOffset(&word, 0, "\\/", "/");
  output->push_back(word);

  // Deliberately written as |slash_index + 1 < size| instead of
  // |slash_index < size - 1|: for an empty line the latter wraps around
  // (size_t 0 - 1) and would go on to call substr(1) on an empty string,
  // which is out of range.
  if (slash_index + 1 < line.size())
    output->push_back(line.substr(slash_index + 1));
}
// Reads |file| line by line and records every word it contains, together with
// its affix index, in |word_set|.
//
// Each line is passed through StripComment(); lines that are empty afterwards
// are skipped. When |file_has_word_count_in_the_first_line| is true, the first
// non-empty line is skipped as well. Every remaining line is split with
// SplitDicLine() into a word and an optional affix field.
//
// The word is used as is when |encoding| equals "UTF-8"; otherwise it is
// converted through |aff_reader|->EncodingToUTF8(). Both the word and the
// affix field are cut at their first tab character. A line without an affix
// field gets affix index 0.
//
// |file_type| is only used in the diagnostics printed on failure.
//
// Returns false as soon as a line cannot be split or converted, and true once
// the end of |file| has been reached.
bool PopulateWordSet(WordSet* word_set, FILE* file, AffReader* aff_reader,
                     const char* file_type, const char* encoding,
                     bool file_has_word_count_in_the_first_line) {
  bool count_line_pending = file_has_word_count_in_the_first_line;

  // |line_number| is 1-based and counts every line read, including skipped
  // ones, so that diagnostics match the position in the file.
  for (int line_number = 1; !feof(file); ++line_number) {
    std::string line = ReadLine(file);
    StripComment(&line);
    if (line.empty())
      continue;

    if (count_line_pending) {
      count_line_pending = false;
      continue;
    }

    std::vector<std::string> fields;
    SplitDicLine(line, &fields);
    const size_t field_count = fields.size();
    if (field_count < 1 || field_count > 2) {
      printf("Line %d has extra slashes in the %s file\n", line_number,
             file_type);
      return false;
    }

    // Bring the word into UTF-8, unless the file is declared to be UTF-8
    // already.
    std::string word;
    if (std::string(encoding) == "UTF-8") {
      word = fields[0];
    } else if (!aff_reader->EncodingToUTF8(fields[0], &word)) {
      printf("Unable to convert line %d from %s to UTF-8 in the %s file\n",
             line_number, encoding, file_type);
      return false;
    }

    // Resolve the affix field, if the line has one, to an affix index.
    int affix_index = 0;
    if (field_count == 2) {
      std::string& affix_field = fields[1];
      const size_t affix_tab = affix_field.find('\t');
      if (affix_tab != std::string::npos)
        affix_field.erase(affix_tab);

      if (aff_reader->has_indexed_affixes()) {
        affix_index = atoi(affix_field.c_str());
      } else {
        affix_index = aff_reader->GetAFIndexForAFString(affix_field);
      }
    }

    // A line without a slash may still carry tab-separated data after the
    // word; keep only the part before the first tab.
    const size_t word_tab = word.find('\t');
    if (word_tab != std::string::npos)
      word.erase(word_tab);

    // operator[] creates an empty set the first time a word is seen.
    (*word_set)[word].insert(affix_index);
  }

  return true;
}
}
// Opens |path| for reading, and additionally tries to open the file named by
// |path| with its extension replaced by "dic_delta". Whether that second file
// could be opened is reported on stdout. Neither open failure is treated as
// an error here; the handles are simply left null.
DicReader::DicReader(const base::FilePath& path)
    : file_(base::OpenFile(path, "r")) {
  const base::FilePath delta_path =
      path.ReplaceExtension(FILE_PATH_LITERAL("dic_delta"));
  additional_words_file_ = base::OpenFile(delta_path, "r");

  if (!additional_words_file_) {
    printf("%" PRFilePath " not found.\n", delta_path.value().c_str());
  } else {
    printf("Reading %" PRFilePath " ...\n", delta_path.value().c_str());
  }
}
// Closes whichever of the two files were successfully opened by the
// constructor.
DicReader::~DicReader() {
  if (file_ != NULL) {
    base::CloseFile(file_);
  }
  if (additional_words_file_ != NULL) {
    base::CloseFile(additional_words_file_);
  }
}
// Loads the words from the main file, and from the additional words file when
// one was opened, then appends them to |words_| and sorts |words_|.
//
// The main file is read with |aff_reader|'s encoding and its first non-empty
// line is skipped. The additional file is read as "UTF-8" with no line
// skipped.
//
// Each word is stored with its affix indices in descending order.
//
// Returns false if the main file is not open or could not be parsed, and true
// otherwise.
bool DicReader::Read(AffReader* aff_reader) {
  if (!file_)
    return false;

  WordSet word_set;
  const bool main_file_ok = PopulateWordSet(
      &word_set, file_, aff_reader, "dic", aff_reader->encoding(), true);
  if (!main_file_ok)
    return false;

  if (additional_words_file_ != NULL) {
    // The result is not checked: a failure here does not make Read() fail.
    PopulateWordSet(&word_set, additional_words_file_, aff_reader, "dic delta",
                    "UTF-8", false);
  }

  for (WordSet::const_iterator entry = word_set.begin();
       entry != word_set.end(); ++entry) {
    // A std::set<int> iterates in ascending order, so walking it backwards
    // yields the affix indices from largest to smallest.
    const std::vector<int> affixes(entry->second.rbegin(),
                                   entry->second.rend());
    words_.push_back(std::make_pair(entry->first, affixes));
  }

  std::sort(words_.begin(), words_.end());
  return true;
}
}