// Command-line driver body for n-gram counting. The leading integers on the
// lines below are line numbers carried over from the original listing; the
// gaps in that numbering mean some statements (conditions, else-arms, returns,
// closing braces, callee names) are not visible here.
//
// Visible flow: build the usage text, parse flags, resolve the input/output
// names from the positional arguments, dispatch on FST_FLAGS_method, and
// return !ngrams_counted (i.e. 0 only when counting was reported successful).
26 #include <fst/flags.h> 28 #include <fst/extensions/far/far.h> 31 #include <fst/vector-fst.h> 59 std::string usage =
"Count n-grams from input file.\n\n Usage: ";
61 usage +=
" [--options] [in.far [out.fst]]\n";
// Parse command-line flags, using the usage text assembled above.
62 SET_FLAGS(usage.c_str(), &argc, &argv,
true);
// Yields argv[1] when a first positional argument exists and is not "-",
// otherwise the empty string. The assignment target is on a line not shown;
// presumably in_name, which is what the branches below read -- TODO confirm.
70 (argc > 1 && (strcmp(argv[1],
"-") != 0)) ? argv[1] :
"";
// Same rule for the second positional argument: "-" or absent maps to "".
71 std::string out_name =
72 (argc > 2 && (strcmp(argv[2],
"-") != 0)) ? argv[2] :
"";
// Success flag; its negation is the value returned at the end.
74 bool ngrams_counted =
false;
// --- method "counts": input is read through a FAR reader opened on in_name.
75 if (FST_FLAGS_method ==
"counts") {
76 std::unique_ptr<fst::FarReader<fst::StdArc>> far_reader(
77 fst::FarReader<fst::StdArc>::Open(in_name));
// Error report for a failed open; the guarding condition and what follows the
// log statement are on lines not shown here.
79 LOG(ERROR) <<
"ngramcount: open of FST archive failed: " << in_name;
// With --output_fst set, results are collected into an FST.
82 if (FST_FLAGS_output_fst) {
83 fst::StdVectorFst
fst;
// Argument list of a call whose name is not visible; the arguments line up
// with the GetNGramCounts signature listed at the end of this listing
// (far_reader, fst, order, require_symbols, epsilon_as_backoff, round_to_int,
// add_to_symbol_unigram_count) -- TODO confirm the callee.
85 far_reader.get(), &fst, FST_FLAGS_order,
86 FST_FLAGS_require_symbols,
87 FST_FLAGS_epsilon_as_backoff,
88 FST_FLAGS_round_to_int,
89 FST_FLAGS_add_to_symbol_unigram_count);
// The FST is written only when counting succeeded.
// NOTE(review): the result of Write() is not used on this line.
90 if (ngrams_counted) fst.Write(out_name);
// Alternative output form: counts as a vector of strings. The branch keyword
// introducing this part is not visible; presumably the else-arm of the
// --output_fst test -- TODO confirm.
92 std::vector<std::string> ngram_counts;
// Argument list of another call whose name is not visible (string-vector
// variant: far_reader, &ngram_counts, order, epsilon_as_backoff,
// add_to_symbol_unigram_count).
94 far_reader.get(), &ngram_counts, FST_FLAGS_order,
95 FST_FLAGS_epsilon_as_backoff,
96 FST_FLAGS_add_to_symbol_unigram_count);
// A file stream is opened only when an output name was given (the declaration
// of ofstrm is on a line not shown).
98 if (!out_name.empty()) {
99 ofstrm.open(out_name);
// Error report for a failed open; its guarding condition is not visible here.
101 LOG(ERROR) <<
"GetNGramCounts: Open failed, file = " << out_name;
// Write to the file when it was opened, otherwise to standard output.
105 std::ostream &ostrm = ofstrm.is_open() ? ofstrm : std::cout;
// Emit one count string per line; std::endl flushes the stream on each line.
106 for (
size_t i = 0; i < ngram_counts.size(); ++i)
107 ostrm << ngram_counts[i] << std::endl;
109 }
// --- method "histograms": same FAR-reader input, output FST over HistogramArc.
else if (FST_FLAGS_method ==
"histograms") {
110 std::unique_ptr<fst::FarReader<fst::StdArc>> far_reader(
111 fst::FarReader<fst::StdArc>::Open(in_name));
// Error report for a failed open; the guarding condition is not visible here.
113 LOG(ERROR) <<
"ngramhistcount: open of FST archive failed: " << in_name;
116 fst::VectorFst<ngram::HistogramArc>
fst;
// Argument list of a call whose name is not visible; the arguments line up
// with the GetNGramHistograms signature listed at the end of this listing
// -- TODO confirm the callee.
118 far_reader.get(), &fst, FST_FLAGS_order,
119 FST_FLAGS_epsilon_as_backoff,
120 FST_FLAGS_backoff_label, FST_FLAGS_norm_eps,
121 FST_FLAGS_check_consistency, FST_FLAGS_normalize,
122 FST_FLAGS_alpha, FST_FLAGS_beta);
// The histogram FST is written only when counting succeeded.
// NOTE(review): the result of Write() is not used on this line.
123 if (ngrams_counted) fst.Write(out_name);
124 }
// --- methods "count_of_counts" / "count_of_histograms": the input is read
// directly as an FST from in_name rather than through a FAR reader.
else if (FST_FLAGS_method ==
"count_of_counts" ||
125 FST_FLAGS_method ==
"count_of_histograms") {
// The success flag is set before any work is done in this branch.
126 ngrams_counted =
true;
// Receives the count-of-counts result for either input arc type.
127 fst::StdVectorFst ccfst;
128 if (FST_FLAGS_method ==
"count_of_counts") {
// Standard-arc input.
129 std::unique_ptr<fst::StdFst>
fst(fst::StdFst::Read(in_name));
// NOTE(review): *fst is dereferenced here; any check of the Read() result
// would be on a line not shown -- verify against the full file.
131 ngram::GetNGramCountOfCounts<fst::StdArc>(
132 *fst, &ccfst, FST_FLAGS_order,
133 FST_FLAGS_context_pattern);
// Histogram-arc input (the branch keyword for this part is not visible;
// presumably the else-arm for "count_of_histograms" -- TODO confirm).
135 std::unique_ptr<fst::VectorFst<ngram::HistogramArc>>
fst(
136 fst::VectorFst<ngram::HistogramArc>::Read(in_name));
138 ngram::GetNGramCountOfCounts<ngram::HistogramArc>(
139 *fst, &ccfst, FST_FLAGS_order,
140 FST_FLAGS_context_pattern);
// NOTE(review): the result of Write() is not used on this line.
142 ccfst.Write(out_name);
// Report for a method value that matched none of the names tested above (the
// branch keyword for this part is not visible).
144 LOG(ERROR) << argv[0]
145 <<
": bad counting method: " << FST_FLAGS_method;
// Return value: 0 when ngrams_counted is true, 1 otherwise.
147 return !ngrams_counted;
// Signatures of symbols used by the code above. These lines carry no bodies or
// terminating semicolons, so they appear to be reference entries from the
// listing rather than definitions -- TODO confirm against the original file.
int ngramcount_main(int argc, char **argv)
DECLARE_bool(round_to_int)
bool GetNGramHistograms(fst::FarReader< fst::StdArc > *far_reader, fst::VectorFst< HistogramArc > *fst, int order, bool epsilon_as_backoff=false, int backoff_label=0, double norm_eps=kNormEps, bool check_consistency=false, bool normalize=false, double alpha=1.0, double beta=1.0)
DECLARE_double(add_to_symbol_unigram_count)
bool GetNGramCounts(fst::FarReader< fst::StdArc > *far_reader, fst::StdMutableFst *fst, int order, bool require_symbols=true, bool epsilon_as_backoff=false, bool round_to_int=false, double add_to_symbol_unigram_count=0.0)