GRM-SFST  sfst-1.2.1
OpenGrm SFst Library
ngramapprox.h
Go to the documentation of this file.
1 // Copyright 2018-2024 Google LLC
2 //
3 // Licensed under the Apache License, Version 2.0 (the 'License');
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 // http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an 'AS IS' BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 // Algorithm to approximate a stochastic FST as an n-gram model.
15 // The output is a canonical and normalized OpenGrm-NGram model.
16 
17 #ifndef NLP_GRM2_SFST_NGRAMAPPROX_H_
18 #define NLP_GRM2_SFST_NGRAMAPPROX_H_
19 
20 
21 #include <fst/fst.h>
22 #include <sfst/approx.h>
23 #include <sfst/normalize.h>
24 #include <sfst/sfst.h>
25 #include <sfst/topology.h>
26 
27 namespace sfst {
28 
29 // Approximates a stochastic FSA as an n-gram model of order 'order'. The input
30 // FST should be a canonical stochastic FSA (see canonical.h). If it is cyclic,
31 // it should be normalized (see normalize.h - not checked). Assumes input has
32 // no (non-phi) epsilons (or treats such epsilons w.r.t. the failure semantics
33 // as if they were regular, uniquely-labeled symbols). The 'phi_label' is the
34 // failure label (defaults to OpenGrm NGram backoff label of 0). The 'delta'
35 // parameter controls the degree of algorithm convergence. The result is a
36 // canonical and normalized OpenGrm ngram model FST. The algorithm computes
37 // (smoothed) counts and then normalizes those counts. See sfst::CountNormType
38 // for the normalization variants. Returns true on success.
39 template <class Arc>
40 bool NGramApprox(const fst::Fst<Arc> &ifst,
41  fst::MutableFst<Arc> *ofst, int order,
42  typename Arc::Label phi_label = 0,
43  float delta = sfst::kApproxDelta,
44  CountNormType norm_type = NORM_KL_MIN) {
45  namespace f = fst;
46  using Label = typename Arc::Label;
47 
48  { // Finds the n-gram topology.
49  NGramTopology<Arc> ngram(order, phi_label, ofst);
50  ngram.FindNGrams(ifst);
51  if (ofst->Properties(f::kError, false)) return false;
52  }
53 
54  return Approx(ifst, ofst, phi_label, delta, norm_type);
55 }
56 
57 } // namespace sfst
58 
59 #endif // NLP_GRM2_SFST_NGRAMAPPROX_H_
Definition: perplexity.h:32
Definition: sfstinfo.cc:39
void FindNGrams(const fst::Fst< Arc > &ifst)
Definition: topology.h:132
CountNormType
Definition: normalize.h:371
constexpr float kApproxDelta
Definition: approx.h:30
bool Approx(const fst::Fst< Arc > &ifst, fst::MutableFst< Arc > *ofst, typename Arc::Label phi_label=fst::kNoLabel, float delta=kApproxDelta, CountNormType norm_type=NORM_KL_MIN)
Definition: approx.h:46
bool NGramApprox(const fst::Fst< Arc > &ifst, fst::MutableFst< Arc > *ofst, int order, typename Arc::Label phi_label=0, float delta=sfst::kApproxDelta, CountNormType norm_type=NORM_KL_MIN)
Definition: ngramapprox.h:40