NGram  ngram-1.3.16
OpenGrm-NGram library
ngramshrink-main.cc
Go to the documentation of this file.
1 // Copyright 2005-2013 Brian Roark
2 // Copyright 2005-2024 Google LLC
3 //
4 // Licensed under the Apache License, Version 2.0 (the 'License');
5 // you may not use this file except in compliance with the License.
6 // You may obtain a copy of the License at
7 //
8 // http://www.apache.org/licenses/LICENSE-2.0
9 //
10 // Unless required by applicable law or agreed to in writing, software
11 // distributed under the License is distributed on an 'AS IS' BASIS,
12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 // See the License for the specific language governing permissions and
14 // limitations under the License.
15 //
16 // Shrinks an input n-gram model using given pruning criteria.
17 
18 #include <cstring>
19 #include <fstream>
20 #include <memory>
21 #include <set>
22 #include <string>
23 #include <vector>
24 
25 #include <fst/flags.h>
26 #include <fst/arc.h>
27 #include <fst/mutable-fst.h>
28 #include <ngram/ngram-list-prune.h>
29 #include <ngram/ngram-shrink.h>
30 
// Command-line flags used by ngramshrink_main below, where each one is read
// as FST_FLAGS_<name>. All but `method`, `list_file` and `retry_downcase` are
// forwarded directly as arguments to ngram::NGramShrinkModel; `method` is
// both forwarded and used to select the list_prune path, while `list_file`
// and `retry_downcase` are only used on that path.
// NOTE(review): DECLARE_* presumably only declares flags whose definitions
// (defaults, help text) live in another translation unit -- confirm.
31 DECLARE_double(total_unigram_count);
32 DECLARE_double(theta);
33 DECLARE_int64(target_number_of_ngrams);
34 DECLARE_int32(min_order_to_prune);
35 DECLARE_string(method);
36 DECLARE_string(list_file);
37 DECLARE_string(count_pattern);
38 DECLARE_string(context_pattern);
39 DECLARE_int32(shrink_opt);
40 DECLARE_int64(backoff_label);
41 DECLARE_double(norm_eps);
42 DECLARE_bool(check_consistency);
43 DECLARE_bool(retry_downcase);
44 
45 int ngramshrink_main(int argc, char **argv) {
46  std::string usage = "Shrink n-gram model from input model file.\n\n Usage: ";
47  usage += argv[0];
48  usage += " [--options] [in.fst [out.fst]]\n";
49  SET_FLAGS(usage.c_str(), &argc, &argv, true);
50 
51  if (argc > 3) {
52  ShowUsage();
53  return 1;
54  }
55 
56  std::string in_name =
57  (argc > 1 && (strcmp(argv[1], "-") != 0)) ? argv[1] : "";
58  std::string out_name = argc > 2 ? argv[2] : "";
59 
60  std::unique_ptr<fst::StdMutableFst> fst(
61  fst::StdMutableFst::Read(in_name, true));
62  if (!fst) return 1;
63 
64  std::set<std::vector<fst::StdArc::Label>> ngram_list;
65  if (FST_FLAGS_method == "list_prune") {
66  if (FST_FLAGS_list_file.empty()) {
67  LOG(WARNING) << "list_file parameter empty, no n-grams given";
68  return 1;
69  }
70  std::ifstream ifstrm(FST_FLAGS_list_file);
71  if (!ifstrm) {
72  LOG(WARNING) << "NGramShrink: Can't open "
73  << FST_FLAGS_list_file << " for reading";
74  return 1;
75  }
76  std::string line;
77  std::vector<std::string> ngrams_to_prune;
78  while (std::getline(ifstrm, line)) {
79  ngrams_to_prune.push_back(line);
80  }
81  ifstrm.close();
82  ngram::GetNGramListToPrune(ngrams_to_prune, fst->InputSymbols(),
83  &ngram_list,
84  FST_FLAGS_retry_downcase);
85  }
87  fst.get(), FST_FLAGS_method, ngram_list,
88  FST_FLAGS_total_unigram_count, FST_FLAGS_theta,
89  FST_FLAGS_target_number_of_ngrams,
90  FST_FLAGS_min_order_to_prune,
91  FST_FLAGS_count_pattern,
92  FST_FLAGS_context_pattern, FST_FLAGS_shrink_opt,
93  FST_FLAGS_backoff_label, FST_FLAGS_norm_eps,
94  FST_FLAGS_check_consistency))
95  return 1;
96 
97  fst->Write(out_name);
98 
99  return 0;
100 }
DECLARE_bool(check_consistency)
void GetNGramListToPrune(const std::vector< std::string > &ngrams_to_prune, const fst::SymbolTable *syms, std::set< std::vector< fst::StdArc::Label >> *ngram_list, bool retry_downcase=false)
int ngramshrink_main(int argc, char **argv)
DECLARE_string(method)
bool NGramShrinkModel(fst::StdMutableFst *fst, const std::string &method, const std::set< std::vector< fst::StdArc::Label >> &ngram_list, double tot_uni=-1.0, double theta=0.0, int64_t target_num=-1, int32_t min_order=2, const std::string &count_pattern="", std::string_view context_pattern="", int shrink_opt=0, fst::StdArc::Label backoff_label=0, double norm_eps=kNormEps, bool check_consistency=false)
Definition: ngram-shrink.cc:68
DECLARE_int64(target_number_of_ngrams)
DECLARE_double(total_unigram_count)
DECLARE_int32(min_order_to_prune)