NGram  ngram-1.3.16
OpenGrm-NGram library
ngramcount-main.cc
Go to the documentation of this file.
1 // Copyright 2005-2013 Brian Roark
2 // Copyright 2005-2024 Google LLC
3 //
4 // Licensed under the Apache License, Version 2.0 (the 'License');
5 // you may not use this file except in compliance with the License.
6 // You may obtain a copy of the License at
7 //
8 // http://www.apache.org/licenses/LICENSE-2.0
9 //
10 // Unless required by applicable law or agreed to in writing, software
11 // distributed under the License is distributed on an 'AS IS' BASIS,
12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 // See the License for the specific language governing permissions and
14 // limitations under the License.
15 //
16 // Counts n-grams (or n-gram histograms) from an input FST archive (FAR) file, or computes count-of-counts from an input count FST.
17 
18 #include <cstring>
19 #include <fstream>
20 #include <iostream>
21 #include <memory>
22 #include <ostream>
23 #include <string>
24 #include <vector>
25 
26 #include <fst/flags.h>
27 #include <fst/log.h>
28 #include <fst/extensions/far/far.h>
29 #include <fst/arc.h>
30 #include <fst/fst.h>
31 #include <fst/vector-fst.h>
32 #include <ngram/hist-arc.h>
33 #include <ngram/ngram-count.h>
34 
35 DECLARE_string(method);
37 
38 // For counting:
39 DECLARE_bool(round_to_int);
40 DECLARE_bool(output_fst);
41 DECLARE_bool(require_symbols);
42 DECLARE_double(add_to_symbol_unigram_count);
43 
44 // For counting and histograms:
45 DECLARE_bool(epsilon_as_backoff);
46 
47 // For count-of-counting:
48 DECLARE_string(context_pattern);
49 
50 // For merging:
51 DECLARE_double(alpha);
52 DECLARE_double(beta);
53 DECLARE_bool(normalize);
54 DECLARE_int64(backoff_label);
55 DECLARE_double(norm_eps);
56 DECLARE_bool(check_consistency);
57 
58 int ngramcount_main(int argc, char **argv) {
59  std::string usage = "Count n-grams from input file.\n\n Usage: ";
60  usage += argv[0];
61  usage += " [--options] [in.far [out.fst]]\n";
62  SET_FLAGS(usage.c_str(), &argc, &argv, true);
63 
64  if (argc > 3) {
65  ShowUsage();
66  return 1;
67  }
68 
69  std::string in_name =
70  (argc > 1 && (strcmp(argv[1], "-") != 0)) ? argv[1] : "";
71  std::string out_name =
72  (argc > 2 && (strcmp(argv[2], "-") != 0)) ? argv[2] : "";
73 
74  bool ngrams_counted = false;
75  if (FST_FLAGS_method == "counts") {
76  std::unique_ptr<fst::FarReader<fst::StdArc>> far_reader(
77  fst::FarReader<fst::StdArc>::Open(in_name));
78  if (!far_reader) {
79  LOG(ERROR) << "ngramcount: open of FST archive failed: " << in_name;
80  return 1;
81  }
82  if (FST_FLAGS_output_fst) {
83  fst::StdVectorFst fst;
84  ngrams_counted = ngram::GetNGramCounts(
85  far_reader.get(), &fst, FST_FLAGS_order,
86  FST_FLAGS_require_symbols,
87  FST_FLAGS_epsilon_as_backoff,
88  FST_FLAGS_round_to_int,
89  FST_FLAGS_add_to_symbol_unigram_count);
90  if (ngrams_counted) fst.Write(out_name);
91  } else {
92  std::vector<std::string> ngram_counts;
93  ngrams_counted = ngram::GetNGramCounts(
94  far_reader.get(), &ngram_counts, FST_FLAGS_order,
95  FST_FLAGS_epsilon_as_backoff,
96  FST_FLAGS_add_to_symbol_unigram_count);
97  std::ofstream ofstrm;
98  if (!out_name.empty()) {
99  ofstrm.open(out_name);
100  if (!ofstrm) {
101  LOG(ERROR) << "GetNGramCounts: Open failed, file = " << out_name;
102  return 1;
103  }
104  }
105  std::ostream &ostrm = ofstrm.is_open() ? ofstrm : std::cout;
106  for (size_t i = 0; i < ngram_counts.size(); ++i)
107  ostrm << ngram_counts[i] << std::endl;
108  }
109  } else if (FST_FLAGS_method == "histograms") {
110  std::unique_ptr<fst::FarReader<fst::StdArc>> far_reader(
111  fst::FarReader<fst::StdArc>::Open(in_name));
112  if (!far_reader) {
113  LOG(ERROR) << "ngramhistcount: open of FST archive failed: " << in_name;
114  return 1;
115  }
116  fst::VectorFst<ngram::HistogramArc> fst;
117  ngrams_counted = ngram::GetNGramHistograms(
118  far_reader.get(), &fst, FST_FLAGS_order,
119  FST_FLAGS_epsilon_as_backoff,
120  FST_FLAGS_backoff_label, FST_FLAGS_norm_eps,
121  FST_FLAGS_check_consistency, FST_FLAGS_normalize,
122  FST_FLAGS_alpha, FST_FLAGS_beta);
123  if (ngrams_counted) fst.Write(out_name);
124  } else if (FST_FLAGS_method == "count_of_counts" ||
125  FST_FLAGS_method == "count_of_histograms") {
126  ngrams_counted = true;
127  fst::StdVectorFst ccfst;
128  if (FST_FLAGS_method == "count_of_counts") {
129  std::unique_ptr<fst::StdFst> fst(fst::StdFst::Read(in_name));
130  if (!fst) return 1;
131  ngram::GetNGramCountOfCounts<fst::StdArc>(
132  *fst, &ccfst, FST_FLAGS_order,
133  FST_FLAGS_context_pattern);
134  } else {
135  std::unique_ptr<fst::VectorFst<ngram::HistogramArc>> fst(
136  fst::VectorFst<ngram::HistogramArc>::Read(in_name));
137  if (!fst) return 1;
138  ngram::GetNGramCountOfCounts<ngram::HistogramArc>(
139  *fst, &ccfst, FST_FLAGS_order,
140  FST_FLAGS_context_pattern);
141  }
142  ccfst.Write(out_name);
143  } else {
144  LOG(ERROR) << argv[0]
145  << ": bad counting method: " << FST_FLAGS_method;
146  }
147  return !ngrams_counted;
148 }
int ngramcount_main(int argc, char **argv)
DECLARE_string(method)
DECLARE_bool(round_to_int)
bool GetNGramHistograms(fst::FarReader< fst::StdArc > *far_reader, fst::VectorFst< HistogramArc > *fst, int order, bool epsilon_as_backoff=false, int backoff_label=0, double norm_eps=kNormEps, bool check_consistency=false, bool normalize=false, double alpha=1.0, double beta=1.0)
Definition: ngram-count.cc:133
DECLARE_int64(order)
DECLARE_double(add_to_symbol_unigram_count)
bool GetNGramCounts(fst::FarReader< fst::StdArc > *far_reader, fst::StdMutableFst *fst, int order, bool require_symbols=true, bool epsilon_as_backoff=false, bool round_to_int=false, double add_to_symbol_unigram_count=0.0)
Definition: ngram-count.cc:187