Commit 5e899807 by 陶然

init

parent 51a59d4b
@@ -4,3 +4,7 @@
/Src/obj/Debug
/Src/bin/Debug
/Jieba.Net/bin/Debug
/Jieba.Net/obj/Debug
/LDA/bin
/LDA/obj
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Reflection;
using System.Text;
namespace JiebaNet.Analyser
{
public class IdfLoader
{
internal string IdfFilePath { get; set; }
internal IDictionary<string, double> IdfFreq { get; set; }
internal double MedianIdf { get; set; }
public IdfLoader(string idfPath = null)
{
IdfFilePath = string.Empty;
IdfFreq = new Dictionary<string, double>();
MedianIdf = 0.0;
SetNewPath(idfPath);
//if (!string.IsNullOrWhiteSpace(idfPath))
//{
// SetNewPath(idfPath);
//}
}
public void SetNewPath(string newIdfPath)
{
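// NOTE: the newIdfPath argument is currently unused; the IDF table is always
// loaded from the embedded resource (the file-based loader is commented out below).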
using (var sr = new StreamReader(Assembly.GetExecutingAssembly().GetManifestResourceStream("Jieba.Net.Segmenter.Resources.idf.txt"), Encoding.UTF8))
{
IdfFreq = new Dictionary<string, double>();
string line = null;
while ((line = sr.ReadLine()) != null)
{
var parts = line.Trim().Split(' ');
var word = parts[0];
var freq = double.Parse(parts[1]);
IdfFreq[word] = freq;
}
MedianIdf = IdfFreq.Values.OrderBy(v => v).ToList()[IdfFreq.Count / 2];
}
//var idfPath = Path.GetFullPath(newIdfPath);
//if (IdfFilePath != idfPath)
//{
// IdfFilePath = idfPath;
// var lines = File.ReadAllLines(idfPath, Encoding.UTF8);
// IdfFreq = new Dictionary<string, double>();
// foreach (var line in lines)
// {
// var parts = line.Trim().Split(' ');
// var word = parts[0];
// var freq = double.Parse(parts[1]);
// IdfFreq[word] = freq;
// }
// MedianIdf = IdfFreq.Values.OrderBy(v => v).ToList()[IdfFreq.Count / 2];
//}
}
}
}
\ No newline at end of file
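// Minimal usage sketch for IdfLoader, assuming it compiles into the same assembly
// (IdfFreq and MedianIdf are internal); the Samples namespace and class name are illustrative.
namespace JiebaNet.Analyser.Samples
{
internal static class IdfLoaderSample
{
public static void Run()
{
var loader = new IdfLoader();
// The constructor loads Jieba.Net.Segmenter.Resources.idf.txt and computes the median IDF.
System.Console.WriteLine("entries: {0}, median idf: {1}", loader.IdfFreq.Count, loader.MedianIdf);
}
}
}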
using System.Collections.Generic;
using System.IO;
using System.Reflection;
using System.Text;
namespace JiebaNet.Analyser
{
public abstract class KeywordExtractor
{
protected static readonly List<string> DefaultStopWords = new List<string>()
{
"the", "of", "is", "and", "to", "in", "that", "we", "for", "an", "are",
"by", "be", "as", "on", "with", "can", "if", "from", "which", "you", "it",
"this", "then", "at", "have", "all", "not", "one", "has", "or", "that"
};
protected virtual ISet<string> StopWords { get; set; }
public void SetStopWords(string stopWordsFile = null)
{
StopWords = new HashSet<string>();
using (var sr = new StreamReader(Assembly.GetExecutingAssembly().GetManifestResourceStream("Jieba.Net.Segmenter.Resources.stopwords.txt"), Encoding.UTF8))
{
string line = null;
while ((line = sr.ReadLine()) != null)
{
StopWords.Add(line.Trim());
}
}
// var path = Path.GetFullPath(stopWordsFile);
//if (File.Exists(path))
//{
// var lines = File.ReadAllLines(path);
// foreach (var line in lines)
// {
// StopWords.Add(line.Trim());
// }
//}
}
public void AddStopWord(string word)
{
var trimmed = word.Trim();
if (!StopWords.Contains(trimmed))
{
StopWords.Add(trimmed);
}
}
public void AddStopWords(IEnumerable<string> words)
{
foreach (var word in words)
{
AddStopWord(word);
}
}
public abstract IEnumerable<string> ExtractTags(string text, int count = 20, IEnumerable<string> allowPos = null);
public abstract IEnumerable<WordWeightPair> ExtractTagsWithWeight(string text, int count = 20, IEnumerable<string> allowPos = null);
}
}
\ No newline at end of file
using System.Collections.Generic;
using System.Linq;
using JiebaNet.Segmenter;
using JiebaNet.Segmenter.Common;
using JiebaNet.Segmenter.PosSeg;
namespace JiebaNet.Analyser
{
public class TextRankExtractor : KeywordExtractor
{
private static readonly IEnumerable<string> DefaultPosFilter = new List<string>()
{
"n", "ng", "nr", "nrfg", "nrt", "ns", "nt", "nz", "v", "vd", "vg", "vi", "vn", "vq"
};
private JiebaSegmenter Segmenter { get; set; }
private PosSegmenter PosSegmenter { get; set; }
public int Span { get; set; }
public bool PairFilter(IEnumerable<string> allowPos, Pair wp)
{
return allowPos.Contains(wp.Flag)
&& wp.Word.Trim().Length >= 2
&& !StopWords.Contains(wp.Word.ToLower());
}
public TextRankExtractor()
{
Span = 5;
Segmenter = new JiebaSegmenter();
PosSegmenter = new PosSegmenter(Segmenter);
SetStopWords();
//SetStopWords(ConfigManager.StopWordsFile);
if (StopWords.IsEmpty())
{
StopWords.UnionWith(DefaultStopWords);
}
}
public override IEnumerable<string> ExtractTags(string text, int count = 20, IEnumerable<string> allowPos = null)
{
var rank = ExtractTagRank(text, allowPos);
if (count <= 0) { count = 20; }
return rank.OrderByDescending(p => p.Value).Select(p => p.Key).Take(count);
}
public override IEnumerable<WordWeightPair> ExtractTagsWithWeight(string text, int count = 20, IEnumerable<string> allowPos = null)
{
var rank = ExtractTagRank(text, allowPos);
if (count <= 0) { count = 20; }
return rank.OrderByDescending(p => p.Value).Select(p => new WordWeightPair()
{
Word = p.Key, Weight = p.Value
}).Take(count);
}
#region Private Helpers
private IDictionary<string, double> ExtractTagRank(string text, IEnumerable<string> allowPos)
{
if (allowPos.IsEmpty())
{
allowPos = DefaultPosFilter;
}
var g = new UndirectWeightedGraph();
var cm = new Dictionary<string, int>();
var words = PosSegmenter.Cut(text).ToList();
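// Count co-occurrences of filtered word pairs within a sliding window of Span words.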
for (var i = 0; i < words.Count; i++)
{
var wp = words[i];
if (PairFilter(allowPos, wp))
{
for (var j = i + 1; j < i + Span; j++)
{
if (j >= words.Count)
{
break;
}
if (!PairFilter(allowPos, words[j]))
{
continue;
}
// TODO: better separator.
var key = wp.Word + "$" + words[j].Word;
if (!cm.ContainsKey(key))
{
cm[key] = 0;
}
cm[key] += 1;
}
}
}
foreach (var p in cm)
{
var terms = p.Key.Split('$');
g.AddEdge(terms[0], terms[1], p.Value);
}
return g.Rank();
}
#endregion
}
}
\ No newline at end of file
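// Minimal usage sketch for TextRankExtractor; the sample text and the Samples namespace are illustrative.
namespace JiebaNet.Analyser.Samples
{
internal static class TextRankSample
{
public static void Run()
{
var extractor = new TextRankExtractor();
// Top 10 keywords, using the default POS filter (nouns and verbs).
foreach (var tag in extractor.ExtractTags("这里是一段待提取关键词的中文文本", 10))
{
System.Console.WriteLine(tag);
}
}
}
}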
using System;
using System.Collections.Generic;
using System.Linq;
using JiebaNet.Segmenter;
using JiebaNet.Segmenter.Common;
using JiebaNet.Segmenter.PosSeg;
namespace JiebaNet.Analyser
{
public class TfidfExtractor : KeywordExtractor
{
//private static readonly string DefaultIdfFile = ConfigManager.IdfFile;
private static readonly int DefaultWordCount = 20;
private JiebaSegmenter Segmenter { get; set; }
private PosSegmenter PosSegmenter { get; set; }
private IdfLoader Loader { get; set; }
private IDictionary<string, double> IdfFreq { get; set; }
private double MedianIdf { get; set; }
public TfidfExtractor(JiebaSegmenter segmenter = null)
{
Segmenter = segmenter.IsNull() ? new JiebaSegmenter() : segmenter;
PosSegmenter = new PosSegmenter(Segmenter);
SetStopWords();
//SetStopWords(ConfigManager.StopWordsFile);
if (StopWords.IsEmpty())
{
StopWords.UnionWith(DefaultStopWords);
}
Loader = new IdfLoader();
IdfFreq = Loader.IdfFreq;
MedianIdf = Loader.MedianIdf;
}
//public void SetIdfPath(string idfPath)
//{
// Loader.SetNewPath(idfPath);
// IdfFreq = Loader.IdfFreq;
// MedianIdf = Loader.MedianIdf;
//}
private IEnumerable<string> FilterCutByPos(string text, IEnumerable<string> allowPos)
{
var posTags = PosSegmenter.Cut(text).Where(p => allowPos.Contains(p.Flag));
return posTags.Select(p => p.Word);
}
private IDictionary<string, double> GetWordTfidf(string text, IEnumerable<string> allowPos)
{
IEnumerable<string> words = null;
if (allowPos.IsNotEmpty())
{
words = FilterCutByPos(text, allowPos);
}
else
{
words = Segmenter.Cut(text);
}
// Calculate TF
var freq = new Dictionary<string, double>();
foreach (var word in words)
{
var w = word;
if (string.IsNullOrEmpty(w) || w.Trim().Length < 2 || StopWords.Contains(w.ToLower()))
{
continue;
}
freq[w] = freq.GetDefault(w, 0.0) + 1.0;
}
var total = freq.Values.Sum();
foreach (var k in freq.Keys.ToList())
{
freq[k] *= IdfFreq.GetDefault(k, MedianIdf) / total;
}
return freq;
}
public override IEnumerable<string> ExtractTags(string text, int count = 20, IEnumerable<string> allowPos = null)
{
if (count <= 0) { count = DefaultWordCount; }
var freq = GetWordTfidf(text, allowPos);
return freq.OrderByDescending(p => p.Value).Select(p => p.Key).Take(count);
}
public override IEnumerable<WordWeightPair> ExtractTagsWithWeight(string text, int count = 20, IEnumerable<string> allowPos = null)
{
if (count <= 0) { count = DefaultWordCount; }
var freq = GetWordTfidf(text, allowPos);
return freq.OrderByDescending(p => p.Value).Select(p => new WordWeightPair()
{
Word = p.Key, Weight = p.Value
}).Take(count);
}
}
public class WordWeightPair
{
public string Word { get; set; }
public double Weight { get; set; }
}
}
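// Minimal usage sketch for TfidfExtractor; the sample text and the Samples namespace are illustrative.
namespace JiebaNet.Analyser.Samples
{
internal static class TfidfSample
{
public static void Run()
{
var extractor = new TfidfExtractor();
// Top 10 keywords together with their TF-IDF weights.
foreach (var pair in extractor.ExtractTagsWithWeight("这里是一段待提取关键词的中文文本", 10))
{
System.Console.WriteLine("{0}: {1:F4}", pair.Word, pair.Weight);
}
}
}
}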
using System;
using System.Collections.Generic;
using System.Linq;
namespace JiebaNet.Analyser
{
public class Edge
{
public string Start { get; set; }
public string End { get; set; }
public double Weight { get; set; }
}
public class UndirectWeightedGraph
{
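// Damping factor for the PageRank-style update in Rank(); 0.85 is the conventional value.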
private static readonly double d = 0.85;
public IDictionary<string, List<Edge>> Graph { get; set; }
public UndirectWeightedGraph()
{
Graph = new Dictionary<string, List<Edge>>();
}
public void AddEdge(string start, string end, double weight)
{
if (!Graph.ContainsKey(start))
{
Graph[start] = new List<Edge>();
}
if (!Graph.ContainsKey(end))
{
Graph[end] = new List<Edge>();
}
Graph[start].Add(new Edge(){ Start = start, End = end, Weight = weight });
Graph[end].Add(new Edge(){ Start = end, End = start, Weight = weight });
}
public IDictionary<string, double> Rank()
{
var ws = new Dictionary<string, double>();
var outSum = new Dictionary<string, double>();
// init scores
var count = Graph.Count > 0 ? Graph.Count : 1;
var wsdef = 1.0/count;
foreach (var pair in Graph)
{
ws[pair.Key] = wsdef;
outSum[pair.Key] = pair.Value.Sum(e => e.Weight);
}
// TODO: 10 iterations?
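// A fixed count of 10 iterations mirrors jieba's Python TextRank implementation.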
var sortedKeys = Graph.Keys.OrderBy(k => k);
for (var i = 0; i < 10; i++)
{
foreach (var n in sortedKeys)
{
var s = 0d;
foreach (var edge in Graph[n])
{
s += edge.Weight/outSum[edge.End]*ws[edge.End];
}
ws[n] = (1 - d) + d*s;
}
}
var minRank = double.MaxValue;
var maxRank = double.MinValue;
foreach (var w in ws.Values)
{
if (w < minRank)
{
minRank = w;
}
if(w > maxRank)
{
maxRank = w;
}
}
foreach (var pair in ws.ToList())
{
ws[pair.Key] = (pair.Value - minRank/10.0)/(maxRank - minRank/10.0);
}
return ws;
}
}
}
\ No newline at end of file
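// Minimal usage sketch for UndirectWeightedGraph on a tiny hand-built graph; the node names are illustrative.
namespace JiebaNet.Analyser.Samples
{
internal static class GraphSample
{
public static void Run()
{
var g = new UndirectWeightedGraph();
g.AddEdge("cat", "dog", 2);
g.AddEdge("dog", "fish", 1);
// Rank() normalizes scores into (0, 1], with 1 for the best-ranked node.
foreach (var pair in g.Rank())
{
System.Console.WriteLine("{0}: {1:F4}", pair.Key, pair.Value);
}
}
}
}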
<?xml version="1.0" encoding="utf-8"?>
<Project ToolsVersion="15.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<Import Project="$(MSBuildExtensionsPath)\$(MSBuildToolsVersion)\Microsoft.Common.props" Condition="Exists('$(MSBuildExtensionsPath)\$(MSBuildToolsVersion)\Microsoft.Common.props')" />
<PropertyGroup>
<Configuration Condition=" '$(Configuration)' == '' ">Debug</Configuration>
<Platform Condition=" '$(Platform)' == '' ">AnyCPU</Platform>
<ProjectGuid>{E4554146-B7EA-464B-9EE2-5923F7721E64}</ProjectGuid>
<OutputType>Library</OutputType>
<AppDesignerFolder>Properties</AppDesignerFolder>
<RootNamespace>Jieba.Net</RootNamespace>
<AssemblyName>Jieba.Net</AssemblyName>
<TargetFrameworkVersion>v4.5</TargetFrameworkVersion>
<FileAlignment>512</FileAlignment>
<Deterministic>true</Deterministic>
<TargetFrameworkProfile />
</PropertyGroup>
<PropertyGroup Condition=" '$(Configuration)|$(Platform)' == 'Debug|AnyCPU' ">
<DebugSymbols>true</DebugSymbols>
<DebugType>full</DebugType>
<Optimize>false</Optimize>
<OutputPath>bin\Debug\</OutputPath>
<DefineConstants>DEBUG;TRACE</DefineConstants>
<ErrorReport>prompt</ErrorReport>
<WarningLevel>4</WarningLevel>
<Prefer32Bit>false</Prefer32Bit>
</PropertyGroup>
<PropertyGroup Condition=" '$(Configuration)|$(Platform)' == 'Release|AnyCPU' ">
<DebugType>pdbonly</DebugType>
<Optimize>true</Optimize>
<OutputPath>bin\Release\</OutputPath>
<DefineConstants>TRACE</DefineConstants>
<ErrorReport>prompt</ErrorReport>
<WarningLevel>4</WarningLevel>
<Prefer32Bit>false</Prefer32Bit>
</PropertyGroup>
<ItemGroup>
<Reference Include="Kivii.Common.V4.0, Version=5.6.2021.5080, Culture=neutral, processorArchitecture=MSIL">
<HintPath>..\packages\Kivii.Common.5.6.2021.5080\lib\net40\Kivii.Common.V4.0.dll</HintPath>
</Reference>
<Reference Include="System" />
<Reference Include="System.Core" />
<Reference Include="System.Data" />
</ItemGroup>
<ItemGroup>
<Compile Include="Analyser\IdfLoader.cs" />
<Compile Include="Analyser\KeywordExtractor.cs" />
<Compile Include="Analyser\TextRankExtractor.cs" />
<Compile Include="Analyser\TfidfExtractor.cs" />
<Compile Include="Analyser\UndirectWeightedGraph.cs" />
<Compile Include="LemmaSharp\Classes\7zip\Common\CommandLineParser.cs" />
<Compile Include="LemmaSharp\Classes\7zip\Common\CRC.cs" />
<Compile Include="LemmaSharp\Classes\7zip\Common\InBuffer.cs" />
<Compile Include="LemmaSharp\Classes\7zip\Common\OutBuffer.cs" />
<Compile Include="LemmaSharp\Classes\7zip\Compress\LZMA\LzmaBase.cs" />
<Compile Include="LemmaSharp\Classes\7zip\Compress\LZMA\LzmaDecoder.cs" />
<Compile Include="LemmaSharp\Classes\7zip\Compress\LZMA\LzmaEncoder.cs" />
<Compile Include="LemmaSharp\Classes\7zip\Compress\LZ\IMatchFinder.cs" />
<Compile Include="LemmaSharp\Classes\7zip\Compress\LZ\LzBinTree.cs" />
<Compile Include="LemmaSharp\Classes\7zip\Compress\LZ\LzInWindow.cs" />
<Compile Include="LemmaSharp\Classes\7zip\Compress\LZ\LzOutWindow.cs" />
<Compile Include="LemmaSharp\Classes\7zip\Compress\RangeCoder\RangeCoder.cs" />
<Compile Include="LemmaSharp\Classes\7zip\Compress\RangeCoder\RangeCoderBit.cs" />
<Compile Include="LemmaSharp\Classes\7zip\Compress\RangeCoder\RangeCoderBitTree.cs" />
<Compile Include="LemmaSharp\Classes\7zip\ICoder.cs" />
<Compile Include="LemmaSharp\Classes\Constants.cs" />
<Compile Include="LemmaSharp\Classes\ExampleList.cs" />
<Compile Include="LemmaSharp\Classes\LemmaExample.cs" />
<Compile Include="LemmaSharp\Classes\LemmaRule.cs" />
<Compile Include="LemmaSharp\Classes\Lemmatizer.cs" />
<Compile Include="LemmaSharp\Classes\LemmatizerSettings.cs" />
<Compile Include="LemmaSharp\Classes\LemmaTreeNode.cs" />
<Compile Include="LemmaSharp\Classes\RuleList.cs" />
<Compile Include="LemmaSharp\Classes\RuleWeighted.cs" />
<Compile Include="LemmaSharp\Interfaces\ILemmatizer.cs" />
<Compile Include="LemmaSharp\Interfaces\ILemmatizerModel.cs" />
<Compile Include="LemmaSharp\Interfaces\ILemmatizerTrainable.cs" />
<Compile Include="LemmaSharp\LatinoCompatibility\BinarySerializer.cs" />
<Compile Include="Properties\AssemblyInfo.cs" />
<Compile Include="Segmenter\Common\Counter.cs" />
<Compile Include="Segmenter\Common\Extensions.cs" />
<Compile Include="Segmenter\Common\KeywordTrie.cs" />
<Compile Include="Segmenter\Common\Trie.cs" />
<Compile Include="Segmenter\ConfigManager.cs" />
<Compile Include="Segmenter\Constants.cs" />
<Compile Include="Segmenter\DefaultDictionary.cs" />
<Compile Include="Segmenter\FinalSeg\IFinalSeg.cs" />
<Compile Include="Segmenter\FinalSeg\Viterbi.cs" />
<Compile Include="Segmenter\JiebaSegmenter.cs" />
<Compile Include="Segmenter\KeywordProcessor.cs" />
<Compile Include="Segmenter\Node.cs" />
<Compile Include="Segmenter\Pair.cs" />
<Compile Include="Segmenter\PosSeg\Pair.cs" />
<Compile Include="Segmenter\PosSeg\PosSegmenter.cs" />
<Compile Include="Segmenter\PosSeg\Viterbi.cs" />
<Compile Include="Segmenter\Spelling\SpellChecker.cs" />
<Compile Include="Segmenter\Token.cs" />
<Compile Include="Segmenter\WordDictionary.cs" />
</ItemGroup>
<ItemGroup>
<None Include="packages.config" />
<EmbeddedResource Include="Segmenter\Resources\pos_prob_emit.json" />
<EmbeddedResource Include="Segmenter\Resources\pos_prob_trans.json" />
<EmbeddedResource Include="Segmenter\Resources\prob_emit.json" />
<EmbeddedResource Include="Segmenter\Resources\prob_trans.json" />
</ItemGroup>
<ItemGroup>
<EmbeddedResource Include="Segmenter\Resources\dict.txt" />
</ItemGroup>
<ItemGroup />
<Import Project="$(MSBuildToolsPath)\Microsoft.CSharp.targets" />
</Project>
\ No newline at end of file
<?xml version="1.0" encoding="utf-8"?>
<Project ToolsVersion="15.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<PropertyGroup>
<ProjectView>ProjectFiles</ProjectView>
</PropertyGroup>
</Project>
\ No newline at end of file
// Common/CRC.cs
namespace SevenZip
{
class CRC
{
public static readonly uint[] Table;
static CRC()
{
Table = new uint[256];
const uint kPoly = 0xEDB88320;
for (uint i = 0; i < 256; i++)
{
uint r = i;
for (int j = 0; j < 8; j++)
if ((r & 1) != 0)
r = (r >> 1) ^ kPoly;
else
r >>= 1;
Table[i] = r;
}
}
uint _value = 0xFFFFFFFF;
public void Init() { _value = 0xFFFFFFFF; }
public void UpdateByte(byte b)
{
_value = Table[(((byte)(_value)) ^ b)] ^ (_value >> 8);
}
public void Update(byte[] data, uint offset, uint size)
{
for (uint i = 0; i < size; i++)
_value = Table[(((byte)(_value)) ^ data[offset + i])] ^ (_value >> 8);
}
public uint GetDigest() { return _value ^ 0xFFFFFFFF; }
static uint CalculateDigest(byte[] data, uint offset, uint size)
{
CRC crc = new CRC();
// crc.Init();
crc.Update(data, offset, size);
return crc.GetDigest();
}
static bool VerifyDigest(uint digest, byte[] data, uint offset, uint size)
{
return (CalculateDigest(data, offset, size) == digest);
}
}
}
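// Minimal usage sketch for the CRC class above (CRC-32, polynomial 0xEDB88320); CrcSample is illustrative.
namespace SevenZip
{
static class CrcSample
{
public static uint Crc32(byte[] data)
{
var crc = new CRC(); // _value starts at 0xFFFFFFFF, so Init() is only needed when reusing an instance
crc.Update(data, 0, (uint)data.Length);
return crc.GetDigest(); // final XOR with 0xFFFFFFFF
}
}
}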
// CommandLineParser.cs
using System;
using System.Collections;
namespace SevenZip.CommandLineParser
{
public enum SwitchType
{
Simple,
PostMinus,
LimitedPostString,
UnLimitedPostString,
PostChar
}
public class SwitchForm
{
public string IDString;
public SwitchType Type;
public bool Multi;
public int MinLen;
public int MaxLen;
public string PostCharSet;
public SwitchForm(string idString, SwitchType type, bool multi,
int minLen, int maxLen, string postCharSet)
{
IDString = idString;
Type = type;
Multi = multi;
MinLen = minLen;
MaxLen = maxLen;
PostCharSet = postCharSet;
}
public SwitchForm(string idString, SwitchType type, bool multi, int minLen):
this(idString, type, multi, minLen, 0, "")
{
}
public SwitchForm(string idString, SwitchType type, bool multi):
this(idString, type, multi, 0)
{
}
}
public class SwitchResult
{
public bool ThereIs;
public bool WithMinus;
public ArrayList PostStrings = new ArrayList();
public int PostCharIndex;
public SwitchResult()
{
ThereIs = false;
}
}
public class Parser
{
public ArrayList NonSwitchStrings = new ArrayList();
SwitchResult[] _switches;
public Parser(int numSwitches)
{
_switches = new SwitchResult[numSwitches];
for (int i = 0; i < numSwitches; i++)
_switches[i] = new SwitchResult();
}
bool ParseString(string srcString, SwitchForm[] switchForms)
{
int len = srcString.Length;
if (len == 0)
return false;
int pos = 0;
if (!IsItSwitchChar(srcString[pos]))
return false;
while (pos < len)
{
if (IsItSwitchChar(srcString[pos]))
pos++;
const int kNoLen = -1;
int matchedSwitchIndex = 0;
int maxLen = kNoLen;
for (int switchIndex = 0; switchIndex < _switches.Length; switchIndex++)
{
int switchLen = switchForms[switchIndex].IDString.Length;
if (switchLen <= maxLen || pos + switchLen > len)
continue;
if (String.Compare(switchForms[switchIndex].IDString, 0,
srcString, pos, switchLen, true) == 0)
{
matchedSwitchIndex = switchIndex;
maxLen = switchLen;
}
}
if (maxLen == kNoLen)
throw new Exception("maxLen == kNoLen");
SwitchResult matchedSwitch = _switches[matchedSwitchIndex];
SwitchForm switchForm = switchForms[matchedSwitchIndex];
if ((!switchForm.Multi) && matchedSwitch.ThereIs)
throw new Exception("switch must be single");
matchedSwitch.ThereIs = true;
pos += maxLen;
int tailSize = len - pos;
SwitchType type = switchForm.Type;
switch (type)
{
case SwitchType.PostMinus:
{
if (tailSize == 0)
matchedSwitch.WithMinus = false;
else
{
matchedSwitch.WithMinus = (srcString[pos] == kSwitchMinus);
if (matchedSwitch.WithMinus)
pos++;
}
break;
}
case SwitchType.PostChar:
{
if (tailSize < switchForm.MinLen)
throw new Exception("switch is not full");
string charSet = switchForm.PostCharSet;
const int kEmptyCharValue = -1;
if (tailSize == 0)
matchedSwitch.PostCharIndex = kEmptyCharValue;
else
{
int index = charSet.IndexOf(srcString[pos]);
if (index < 0)
matchedSwitch.PostCharIndex = kEmptyCharValue;
else
{
matchedSwitch.PostCharIndex = index;
pos++;
}
}
break;
}
case SwitchType.LimitedPostString:
case SwitchType.UnLimitedPostString:
{
int minLen = switchForm.MinLen;
if (tailSize < minLen)
throw new Exception("switch is not full");
if (type == SwitchType.UnLimitedPostString)
{
matchedSwitch.PostStrings.Add(srcString.Substring(pos));
return true;
}
String stringSwitch = srcString.Substring(pos, minLen);
pos += minLen;
for (int i = minLen; i < switchForm.MaxLen && pos < len; i++, pos++)
{
char c = srcString[pos];
if (IsItSwitchChar(c))
break;
stringSwitch += c;
}
matchedSwitch.PostStrings.Add(stringSwitch);
break;
}
}
}
return true;
}
public void ParseStrings(SwitchForm[] switchForms, string[] commandStrings)
{
int numCommandStrings = commandStrings.Length;
bool stopSwitch = false;
for (int i = 0; i < numCommandStrings; i++)
{
string s = commandStrings[i];
if (stopSwitch)
NonSwitchStrings.Add(s);
else
if (s == kStopSwitchParsing)
stopSwitch = true;
else
if (!ParseString(s, switchForms))
NonSwitchStrings.Add(s);
}
}
public SwitchResult this[int index] { get { return _switches[index]; } }
public static int ParseCommand(CommandForm[] commandForms, string commandString,
out string postString)
{
for (int i = 0; i < commandForms.Length; i++)
{
string id = commandForms[i].IDString;
if (commandForms[i].PostStringMode)
{
if (commandString.IndexOf(id) == 0)
{
postString = commandString.Substring(id.Length);
return i;
}
}
else
if (commandString == id)
{
postString = "";
return i;
}
}
postString = "";
return -1;
}
static bool ParseSubCharsCommand(int numForms, CommandSubCharsSet[] forms,
string commandString, ArrayList indices)
{
indices.Clear();
int numUsedChars = 0;
for (int i = 0; i < numForms; i++)
{
CommandSubCharsSet charsSet = forms[i];
int currentIndex = -1;
int len = charsSet.Chars.Length;
for (int j = 0; j < len; j++)
{
char c = charsSet.Chars[j];
int newIndex = commandString.IndexOf(c);
if (newIndex >= 0)
{
if (currentIndex >= 0)
return false;
if (commandString.IndexOf(c, newIndex + 1) >= 0)
return false;
currentIndex = j;
numUsedChars++;
}
}
if (currentIndex == -1 && !charsSet.EmptyAllowed)
return false;
indices.Add(currentIndex);
}
return (numUsedChars == commandString.Length);
}
const char kSwitchID1 = '-';
const char kSwitchID2 = '/';
const char kSwitchMinus = '-';
const string kStopSwitchParsing = "--";
static bool IsItSwitchChar(char c)
{
return (c == kSwitchID1 || c == kSwitchID2);
}
}
public class CommandForm
{
public string IDString = "";
public bool PostStringMode = false;
public CommandForm(string idString, bool postStringMode)
{
IDString = idString;
PostStringMode = postStringMode;
}
}
class CommandSubCharsSet
{
public string Chars = "";
public bool EmptyAllowed = false;
}
}
// InBuffer.cs
namespace SevenZip.Buffer
{
public class InBuffer
{
byte[] m_Buffer;
uint m_Pos;
uint m_Limit;
uint m_BufferSize;
System.IO.Stream m_Stream;
bool m_StreamWasExhausted;
ulong m_ProcessedSize;
public InBuffer(uint bufferSize)
{
m_Buffer = new byte[bufferSize];
m_BufferSize = bufferSize;
}
public void Init(System.IO.Stream stream)
{
m_Stream = stream;
m_ProcessedSize = 0;
m_Limit = 0;
m_Pos = 0;
m_StreamWasExhausted = false;
}
public bool ReadBlock()
{
if (m_StreamWasExhausted)
return false;
m_ProcessedSize += m_Pos;
int aNumProcessedBytes = m_Stream.Read(m_Buffer, 0, (int)m_BufferSize);
m_Pos = 0;
m_Limit = (uint)aNumProcessedBytes;
m_StreamWasExhausted = (aNumProcessedBytes == 0);
return (!m_StreamWasExhausted);
}
public void ReleaseStream()
{
// m_Stream.Close();
m_Stream = null;
}
public bool ReadByte(out byte b) // out parameter so the caller actually receives the byte
{
if (m_Pos >= m_Limit)
if (!ReadBlock())
{
b = 0xFF;
return false;
}
b = m_Buffer[m_Pos++];
return true;
}
public byte ReadByte()
{
// return (byte)m_Stream.ReadByte();
if (m_Pos >= m_Limit)
if (!ReadBlock())
return 0xFF;
return m_Buffer[m_Pos++];
}
public ulong GetProcessedSize()
{
return m_ProcessedSize + m_Pos;
}
}
}
// OutBuffer.cs
namespace SevenZip.Buffer
{
public class OutBuffer
{
byte[] m_Buffer;
uint m_Pos;
uint m_BufferSize;
System.IO.Stream m_Stream;
ulong m_ProcessedSize;
public OutBuffer(uint bufferSize)
{
m_Buffer = new byte[bufferSize];
m_BufferSize = bufferSize;
}
public void SetStream(System.IO.Stream stream) { m_Stream = stream; }
public void FlushStream() { m_Stream.Flush(); }
public void CloseStream() { m_Stream.Close(); }
public void ReleaseStream() { m_Stream = null; }
public void Init()
{
m_ProcessedSize = 0;
m_Pos = 0;
}
public void WriteByte(byte b)
{
m_Buffer[m_Pos++] = b;
if (m_Pos >= m_BufferSize)
FlushData();
}
public void FlushData()
{
if (m_Pos == 0)
return;
m_Stream.Write(m_Buffer, 0, (int)m_Pos);
m_ProcessedSize += m_Pos; // keep GetProcessedSize() accurate across flushes
m_Pos = 0;
}
public ulong GetProcessedSize() { return m_ProcessedSize + m_Pos; }
}
}
// IMatchFinder.cs
using System;
namespace SevenZip.Compression.LZ
{
interface IInWindowStream
{
void SetStream(System.IO.Stream inStream);
void Init();
void ReleaseStream();
Byte GetIndexByte(Int32 index);
UInt32 GetMatchLen(Int32 index, UInt32 distance, UInt32 limit);
UInt32 GetNumAvailableBytes();
}
interface IMatchFinder : IInWindowStream
{
void Create(UInt32 historySize, UInt32 keepAddBufferBefore,
UInt32 matchMaxLen, UInt32 keepAddBufferAfter);
UInt32 GetMatches(UInt32[] distances);
void Skip(UInt32 num);
}
}
// LzBinTree.cs
using System;
namespace SevenZip.Compression.LZ
{
public class BinTree : InWindow, IMatchFinder
{
UInt32 _cyclicBufferPos;
UInt32 _cyclicBufferSize = 0;
UInt32 _matchMaxLen;
UInt32[] _son;
UInt32[] _hash;
UInt32 _cutValue = 0xFF;
UInt32 _hashMask;
UInt32 _hashSizeSum = 0;
bool HASH_ARRAY = true;
const UInt32 kHash2Size = 1 << 10;
const UInt32 kHash3Size = 1 << 16;
const UInt32 kBT2HashSize = 1 << 16;
const UInt32 kStartMaxLen = 1;
const UInt32 kHash3Offset = kHash2Size;
const UInt32 kEmptyHashValue = 0;
const UInt32 kMaxValForNormalize = ((UInt32)1 << 31) - 1;
UInt32 kNumHashDirectBytes = 0;
UInt32 kMinMatchCheck = 4;
UInt32 kFixHashSize = kHash2Size + kHash3Size;
public void SetType(int numHashBytes)
{
HASH_ARRAY = (numHashBytes > 2);
if (HASH_ARRAY)
{
kNumHashDirectBytes = 0;
kMinMatchCheck = 4;
kFixHashSize = kHash2Size + kHash3Size;
}
else
{
kNumHashDirectBytes = 2;
kMinMatchCheck = 2 + 1;
kFixHashSize = 0;
}
}
public new void SetStream(System.IO.Stream stream) { base.SetStream(stream); }
public new void ReleaseStream() { base.ReleaseStream(); }
public new void Init()
{
base.Init();
for (UInt32 i = 0; i < _hashSizeSum; i++)
_hash[i] = kEmptyHashValue;
_cyclicBufferPos = 0;
ReduceOffsets(-1);
}
public new void MovePos()
{
if (++_cyclicBufferPos >= _cyclicBufferSize)
_cyclicBufferPos = 0;
base.MovePos();
if (_pos == kMaxValForNormalize)
Normalize();
}
public new Byte GetIndexByte(Int32 index) { return base.GetIndexByte(index); }
public new UInt32 GetMatchLen(Int32 index, UInt32 distance, UInt32 limit)
{ return base.GetMatchLen(index, distance, limit); }
public new UInt32 GetNumAvailableBytes() { return base.GetNumAvailableBytes(); }
public void Create(UInt32 historySize, UInt32 keepAddBufferBefore,
UInt32 matchMaxLen, UInt32 keepAddBufferAfter)
{
if (historySize > kMaxValForNormalize - 256)
throw new Exception();
_cutValue = 16 + (matchMaxLen >> 1);
UInt32 windowReservSize = (historySize + keepAddBufferBefore +
matchMaxLen + keepAddBufferAfter) / 2 + 256;
base.Create(historySize + keepAddBufferBefore, matchMaxLen + keepAddBufferAfter, windowReservSize);
_matchMaxLen = matchMaxLen;
UInt32 cyclicBufferSize = historySize + 1;
if (_cyclicBufferSize != cyclicBufferSize)
_son = new UInt32[(_cyclicBufferSize = cyclicBufferSize) * 2];
UInt32 hs = kBT2HashSize;
if (HASH_ARRAY)
{
hs = historySize - 1;
hs |= (hs >> 1);
hs |= (hs >> 2);
hs |= (hs >> 4);
hs |= (hs >> 8);
hs >>= 1;
hs |= 0xFFFF;
if (hs > (1 << 24))
hs >>= 1;
_hashMask = hs;
hs++;
hs += kFixHashSize;
}
if (hs != _hashSizeSum)
_hash = new UInt32[_hashSizeSum = hs];
}
public UInt32 GetMatches(UInt32[] distances)
{
UInt32 lenLimit;
if (_pos + _matchMaxLen <= _streamPos)
lenLimit = _matchMaxLen;
else
{
lenLimit = _streamPos - _pos;
if (lenLimit < kMinMatchCheck)
{
MovePos();
return 0;
}
}
UInt32 offset = 0;
UInt32 matchMinPos = (_pos > _cyclicBufferSize) ? (_pos - _cyclicBufferSize) : 0;
UInt32 cur = _bufferOffset + _pos;
UInt32 maxLen = kStartMaxLen; // to avoid items for len < hashSize;
UInt32 hashValue, hash2Value = 0, hash3Value = 0;
if (HASH_ARRAY)
{
UInt32 temp = CRC.Table[_bufferBase[cur]] ^ _bufferBase[cur + 1];
hash2Value = temp & (kHash2Size - 1);
temp ^= ((UInt32)(_bufferBase[cur + 2]) << 8);
hash3Value = temp & (kHash3Size - 1);
hashValue = (temp ^ (CRC.Table[_bufferBase[cur + 3]] << 5)) & _hashMask;
}
else
hashValue = _bufferBase[cur] ^ ((UInt32)(_bufferBase[cur + 1]) << 8);
UInt32 curMatch = _hash[kFixHashSize + hashValue];
if (HASH_ARRAY)
{
UInt32 curMatch2 = _hash[hash2Value];
UInt32 curMatch3 = _hash[kHash3Offset + hash3Value];
_hash[hash2Value] = _pos;
_hash[kHash3Offset + hash3Value] = _pos;
if (curMatch2 > matchMinPos)
if (_bufferBase[_bufferOffset + curMatch2] == _bufferBase[cur])
{
distances[offset++] = maxLen = 2;
distances[offset++] = _pos - curMatch2 - 1;
}
if (curMatch3 > matchMinPos)
if (_bufferBase[_bufferOffset + curMatch3] == _bufferBase[cur])
{
if (curMatch3 == curMatch2)
offset -= 2;
distances[offset++] = maxLen = 3;
distances[offset++] = _pos - curMatch3 - 1;
curMatch2 = curMatch3;
}
if (offset != 0 && curMatch2 == curMatch)
{
offset -= 2;
maxLen = kStartMaxLen;
}
}
_hash[kFixHashSize + hashValue] = _pos;
UInt32 ptr0 = (_cyclicBufferPos << 1) + 1;
UInt32 ptr1 = (_cyclicBufferPos << 1);
UInt32 len0, len1;
len0 = len1 = kNumHashDirectBytes;
if (kNumHashDirectBytes != 0)
{
if (curMatch > matchMinPos)
{
if (_bufferBase[_bufferOffset + curMatch + kNumHashDirectBytes] !=
_bufferBase[cur + kNumHashDirectBytes])
{
distances[offset++] = maxLen = kNumHashDirectBytes;
distances[offset++] = _pos - curMatch - 1;
}
}
}
UInt32 count = _cutValue;
while(true)
{
if(curMatch <= matchMinPos || count-- == 0)
{
_son[ptr0] = _son[ptr1] = kEmptyHashValue;
break;
}
UInt32 delta = _pos - curMatch;
UInt32 cyclicPos = ((delta <= _cyclicBufferPos) ?
(_cyclicBufferPos - delta) :
(_cyclicBufferPos - delta + _cyclicBufferSize)) << 1;
UInt32 pby1 = _bufferOffset + curMatch;
UInt32 len = Math.Min(len0, len1);
if (_bufferBase[pby1 + len] == _bufferBase[cur + len])
{
while(++len != lenLimit)
if (_bufferBase[pby1 + len] != _bufferBase[cur + len])
break;
if (maxLen < len)
{
distances[offset++] = maxLen = len;
distances[offset++] = delta - 1;
if (len == lenLimit)
{
_son[ptr1] = _son[cyclicPos];
_son[ptr0] = _son[cyclicPos + 1];
break;
}
}
}
if (_bufferBase[pby1 + len] < _bufferBase[cur + len])
{
_son[ptr1] = curMatch;
ptr1 = cyclicPos + 1;
curMatch = _son[ptr1];
len1 = len;
}
else
{
_son[ptr0] = curMatch;
ptr0 = cyclicPos;
curMatch = _son[ptr0];
len0 = len;
}
}
MovePos();
return offset;
}
public void Skip(UInt32 num)
{
do
{
UInt32 lenLimit;
if (_pos + _matchMaxLen <= _streamPos)
lenLimit = _matchMaxLen;
else
{
lenLimit = _streamPos - _pos;
if (lenLimit < kMinMatchCheck)
{
MovePos();
continue;
}
}
UInt32 matchMinPos = (_pos > _cyclicBufferSize) ? (_pos - _cyclicBufferSize) : 0;
UInt32 cur = _bufferOffset + _pos;
UInt32 hashValue;
if (HASH_ARRAY)
{
UInt32 temp = CRC.Table[_bufferBase[cur]] ^ _bufferBase[cur + 1];
UInt32 hash2Value = temp & (kHash2Size - 1);
_hash[hash2Value] = _pos;
temp ^= ((UInt32)(_bufferBase[cur + 2]) << 8);
UInt32 hash3Value = temp & (kHash3Size - 1);
_hash[kHash3Offset + hash3Value] = _pos;
hashValue = (temp ^ (CRC.Table[_bufferBase[cur + 3]] << 5)) & _hashMask;
}
else
hashValue = _bufferBase[cur] ^ ((UInt32)(_bufferBase[cur + 1]) << 8);
UInt32 curMatch = _hash[kFixHashSize + hashValue];
_hash[kFixHashSize + hashValue] = _pos;
UInt32 ptr0 = (_cyclicBufferPos << 1) + 1;
UInt32 ptr1 = (_cyclicBufferPos << 1);
UInt32 len0, len1;
len0 = len1 = kNumHashDirectBytes;
UInt32 count = _cutValue;
while (true)
{
if (curMatch <= matchMinPos || count-- == 0)
{
_son[ptr0] = _son[ptr1] = kEmptyHashValue;
break;
}
UInt32 delta = _pos - curMatch;
UInt32 cyclicPos = ((delta <= _cyclicBufferPos) ?
(_cyclicBufferPos - delta) :
(_cyclicBufferPos - delta + _cyclicBufferSize)) << 1;
UInt32 pby1 = _bufferOffset + curMatch;
UInt32 len = Math.Min(len0, len1);
if (_bufferBase[pby1 + len] == _bufferBase[cur + len])
{
while (++len != lenLimit)
if (_bufferBase[pby1 + len] != _bufferBase[cur + len])
break;
if (len == lenLimit)
{
_son[ptr1] = _son[cyclicPos];
_son[ptr0] = _son[cyclicPos + 1];
break;
}
}
if (_bufferBase[pby1 + len] < _bufferBase[cur + len])
{
_son[ptr1] = curMatch;
ptr1 = cyclicPos + 1;
curMatch = _son[ptr1];
len1 = len;
}
else
{
_son[ptr0] = curMatch;
ptr0 = cyclicPos;
curMatch = _son[ptr0];
len0 = len;
}
}
MovePos();
}
while (--num != 0);
}
void NormalizeLinks(UInt32[] items, UInt32 numItems, UInt32 subValue)
{
for (UInt32 i = 0; i < numItems; i++)
{
UInt32 value = items[i];
if (value <= subValue)
value = kEmptyHashValue;
else
value -= subValue;
items[i] = value;
}
}
void Normalize()
{
UInt32 subValue = _pos - _cyclicBufferSize;
NormalizeLinks(_son, _cyclicBufferSize * 2, subValue);
NormalizeLinks(_hash, _hashSizeSum, subValue);
ReduceOffsets((Int32)subValue);
}
public void SetCutValue(UInt32 cutValue) { _cutValue = cutValue; }
}
}
// LzInWindow.cs
using System;
namespace SevenZip.Compression.LZ
{
public class InWindow
{
public Byte[] _bufferBase = null; // pointer to buffer with data
System.IO.Stream _stream;
UInt32 _posLimit; // offset (from _buffer) of first byte when new block reading must be done
bool _streamEndWasReached; // if (true) then _streamPos shows real end of stream
UInt32 _pointerToLastSafePosition;
public UInt32 _bufferOffset;
public UInt32 _blockSize; // Size of Allocated memory block
public UInt32 _pos; // offset (from _buffer) of current byte
UInt32 _keepSizeBefore; // how many BYTEs must be kept in buffer before _pos
UInt32 _keepSizeAfter; // how many BYTEs must be kept in buffer after _pos
public UInt32 _streamPos; // offset (from _buffer) of first not read byte from Stream
public void MoveBlock()
{
UInt32 offset = (UInt32)(_bufferOffset) + _pos - _keepSizeBefore;
// we need one additional byte, since MovePos advances by 1 byte.
if (offset > 0)
offset--;
UInt32 numBytes = (UInt32)(_bufferOffset) + _streamPos - offset;
// check negative offset ????
for (UInt32 i = 0; i < numBytes; i++)
_bufferBase[i] = _bufferBase[offset + i];
_bufferOffset -= offset;
}
public virtual void ReadBlock()
{
if (_streamEndWasReached)
return;
while (true)
{
int size = (int)((0 - _bufferOffset) + _blockSize - _streamPos);
if (size == 0)
return;
int numReadBytes = _stream.Read(_bufferBase, (int)(_bufferOffset + _streamPos), size);
if (numReadBytes == 0)
{
_posLimit = _streamPos;
UInt32 pointerToPosition = _bufferOffset + _posLimit;
if (pointerToPosition > _pointerToLastSafePosition)
_posLimit = (UInt32)(_pointerToLastSafePosition - _bufferOffset);
_streamEndWasReached = true;
return;
}
_streamPos += (UInt32)numReadBytes;
if (_streamPos >= _pos + _keepSizeAfter)
_posLimit = _streamPos - _keepSizeAfter;
}
}
void Free() { _bufferBase = null; }
public void Create(UInt32 keepSizeBefore, UInt32 keepSizeAfter, UInt32 keepSizeReserv)
{
_keepSizeBefore = keepSizeBefore;
_keepSizeAfter = keepSizeAfter;
UInt32 blockSize = keepSizeBefore + keepSizeAfter + keepSizeReserv;
if (_bufferBase == null || _blockSize != blockSize)
{
Free();
_blockSize = blockSize;
_bufferBase = new Byte[_blockSize];
}
_pointerToLastSafePosition = _blockSize - keepSizeAfter;
}
public void SetStream(System.IO.Stream stream) { _stream = stream; }
public void ReleaseStream() { _stream = null; }
public void Init()
{
_bufferOffset = 0;
_pos = 0;
_streamPos = 0;
_streamEndWasReached = false;
ReadBlock();
}
public void MovePos()
{
_pos++;
if (_pos > _posLimit)
{
UInt32 pointerToPosition = _bufferOffset + _pos;
if (pointerToPosition > _pointerToLastSafePosition)
MoveBlock();
ReadBlock();
}
}
public Byte GetIndexByte(Int32 index) { return _bufferBase[_bufferOffset + _pos + index]; }
// index + limit must not exceed _keepSizeAfter
public UInt32 GetMatchLen(Int32 index, UInt32 distance, UInt32 limit)
{
if (_streamEndWasReached)
if ((_pos + index) + limit > _streamPos)
limit = _streamPos - (UInt32)(_pos + index);
distance++;
// Byte *pby = _buffer + (size_t)_pos + index;
UInt32 pby = _bufferOffset + _pos + (UInt32)index;
UInt32 i;
for (i = 0; i < limit && _bufferBase[pby + i] == _bufferBase[pby + i - distance]; i++);
return i;
}
public UInt32 GetNumAvailableBytes() { return _streamPos - _pos; }
public void ReduceOffsets(Int32 subValue)
{
_bufferOffset += (UInt32)subValue;
_posLimit -= (UInt32)subValue;
_pos -= (UInt32)subValue;
_streamPos -= (UInt32)subValue;
}
}
}
// LzOutWindow.cs
namespace SevenZip.Compression.LZ
{
public class OutWindow
{
byte[] _buffer = null;
uint _pos;
uint _windowSize = 0;
uint _streamPos;
System.IO.Stream _stream;
public uint TrainSize = 0;
public void Create(uint windowSize)
{
if (_windowSize != windowSize)
{
// System.GC.Collect();
_buffer = new byte[windowSize];
}
_windowSize = windowSize;
_pos = 0;
_streamPos = 0;
}
public void Init(System.IO.Stream stream, bool solid)
{
ReleaseStream();
_stream = stream;
if (!solid)
{
_streamPos = 0;
_pos = 0;
TrainSize = 0;
}
}
public bool Train(System.IO.Stream stream)
{
long len = stream.Length;
uint size = (len < _windowSize) ? (uint)len : _windowSize;
TrainSize = size;
stream.Position = len - size;
_streamPos = _pos = 0;
while (size > 0)
{
uint curSize = _windowSize - _pos;
if (size < curSize)
curSize = size;
int numReadBytes = stream.Read(_buffer, (int)_pos, (int)curSize);
if (numReadBytes == 0)
return false;
size -= (uint)numReadBytes;
_pos += (uint)numReadBytes;
_streamPos += (uint)numReadBytes;
if (_pos == _windowSize)
_streamPos = _pos = 0;
}
return true;
}
public void ReleaseStream()
{
Flush();
_stream = null;
}
public void Flush()
{
uint size = _pos - _streamPos;
if (size == 0)
return;
_stream.Write(_buffer, (int)_streamPos, (int)size);
if (_pos >= _windowSize)
_pos = 0;
_streamPos = _pos;
}
public void CopyBlock(uint distance, uint len)
{
uint pos = _pos - distance - 1;
if (pos >= _windowSize)
pos += _windowSize;
for (; len > 0; len--)
{
if (pos >= _windowSize)
pos = 0;
_buffer[_pos++] = _buffer[pos++];
if (_pos >= _windowSize)
Flush();
}
}
public void PutByte(byte b)
{
_buffer[_pos++] = b;
if (_pos >= _windowSize)
Flush();
}
public byte GetByte(uint distance)
{
uint pos = _pos - distance - 1;
if (pos >= _windowSize)
pos += _windowSize;
return _buffer[pos];
}
}
}
// LzmaBase.cs
namespace SevenZip.Compression.LZMA
{
internal abstract class Base
{
public const uint kNumRepDistances = 4;
public const uint kNumStates = 12;
// static byte []kLiteralNextStates = {0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 4, 5};
// static byte []kMatchNextStates = {7, 7, 7, 7, 7, 7, 7, 10, 10, 10, 10, 10};
// static byte []kRepNextStates = {8, 8, 8, 8, 8, 8, 8, 11, 11, 11, 11, 11};
// static byte []kShortRepNextStates = {9, 9, 9, 9, 9, 9, 9, 11, 11, 11, 11, 11};
public struct State
{
public uint Index;
public void Init() { Index = 0; }
public void UpdateChar()
{
if (Index < 4) Index = 0;
else if (Index < 10) Index -= 3;
else Index -= 6;
}
public void UpdateMatch() { Index = (uint)(Index < 7 ? 7 : 10); }
public void UpdateRep() { Index = (uint)(Index < 7 ? 8 : 11); }
public void UpdateShortRep() { Index = (uint)(Index < 7 ? 9 : 11); }
public bool IsCharState() { return Index < 7; }
}
public const int kNumPosSlotBits = 6;
public const int kDicLogSizeMin = 0;
// public const int kDicLogSizeMax = 30;
// public const uint kDistTableSizeMax = kDicLogSizeMax * 2;
public const int kNumLenToPosStatesBits = 2; // it's for speed optimization
public const uint kNumLenToPosStates = 1 << kNumLenToPosStatesBits;
public const uint kMatchMinLen = 2;
public static uint GetLenToPosState(uint len)
{
len -= kMatchMinLen;
if (len < kNumLenToPosStates)
return len;
return (uint)(kNumLenToPosStates - 1);
}
public const int kNumAlignBits = 4;
public const uint kAlignTableSize = 1 << kNumAlignBits;
public const uint kAlignMask = (kAlignTableSize - 1);
public const uint kStartPosModelIndex = 4;
public const uint kEndPosModelIndex = 14;
public const uint kNumPosModels = kEndPosModelIndex - kStartPosModelIndex;
public const uint kNumFullDistances = 1 << ((int)kEndPosModelIndex / 2);
public const uint kNumLitPosStatesBitsEncodingMax = 4;
public const uint kNumLitContextBitsMax = 8;
public const int kNumPosStatesBitsMax = 4;
public const uint kNumPosStatesMax = (1 << kNumPosStatesBitsMax);
public const int kNumPosStatesBitsEncodingMax = 4;
public const uint kNumPosStatesEncodingMax = (1 << kNumPosStatesBitsEncodingMax);
public const int kNumLowLenBits = 3;
public const int kNumMidLenBits = 3;
public const int kNumHighLenBits = 8;
public const uint kNumLowLenSymbols = 1 << kNumLowLenBits;
public const uint kNumMidLenSymbols = 1 << kNumMidLenBits;
public const uint kNumLenSymbols = kNumLowLenSymbols + kNumMidLenSymbols +
(1 << kNumHighLenBits);
public const uint kMatchMaxLen = kMatchMinLen + kNumLenSymbols - 1;
}
}
using System;
namespace SevenZip.Compression.RangeCoder
{
class Encoder
{
public const uint kTopValue = (1 << 24);
System.IO.Stream Stream;
public UInt64 Low;
public uint Range;
uint _cacheSize;
byte _cache;
long StartPosition;
public void SetStream(System.IO.Stream stream)
{
Stream = stream;
}
public void ReleaseStream()
{
Stream = null;
}
public void Init()
{
StartPosition = Stream.Position;
Low = 0;
Range = 0xFFFFFFFF;
_cacheSize = 1;
_cache = 0;
}
public void FlushData()
{
for (int i = 0; i < 5; i++)
ShiftLow();
}
public void FlushStream()
{
Stream.Flush();
}
public void CloseStream()
{
Stream.Close();
}
public void Encode(uint start, uint size, uint total)
{
Low += start * (Range /= total);
Range *= size;
while (Range < kTopValue)
{
Range <<= 8;
ShiftLow();
}
}
public void ShiftLow()
{
if ((uint)Low < (uint)0xFF000000 || (uint)(Low >> 32) == 1)
{
byte temp = _cache;
do
{
Stream.WriteByte((byte)(temp + (Low >> 32)));
temp = 0xFF;
}
while (--_cacheSize != 0);
_cache = (byte)(((uint)Low) >> 24);
}
_cacheSize++;
Low = ((uint)Low) << 8;
}
public void EncodeDirectBits(uint v, int numTotalBits)
{
for (int i = numTotalBits - 1; i >= 0; i--)
{
Range >>= 1;
if (((v >> i) & 1) == 1)
Low += Range;
if (Range < kTopValue)
{
Range <<= 8;
ShiftLow();
}
}
}
public void EncodeBit(uint size0, int numTotalBits, uint symbol)
{
uint newBound = (Range >> numTotalBits) * size0;
if (symbol == 0)
Range = newBound;
else
{
Low += newBound;
Range -= newBound;
}
while (Range < kTopValue)
{
Range <<= 8;
ShiftLow();
}
}
public long GetProcessedSizeAdd()
{
return _cacheSize +
Stream.Position - StartPosition + 4;
// (long)Stream.GetProcessedSize();
}
}
class Decoder
{
public const uint kTopValue = (1 << 24);
public uint Range;
public uint Code;
// public Buffer.InBuffer Stream = new Buffer.InBuffer(1 << 16);
public System.IO.Stream Stream;
public void Init(System.IO.Stream stream)
{
// Stream.Init(stream);
Stream = stream;
Code = 0;
Range = 0xFFFFFFFF;
for (int i = 0; i < 5; i++)
Code = (Code << 8) | (byte)Stream.ReadByte();
}
public void ReleaseStream()
{
// Stream.ReleaseStream();
Stream = null;
}
public void CloseStream()
{
Stream.Close();
}
public void Normalize()
{
while (Range < kTopValue)
{
Code = (Code << 8) | (byte)Stream.ReadByte();
Range <<= 8;
}
}
public void Normalize2()
{
if (Range < kTopValue)
{
Code = (Code << 8) | (byte)Stream.ReadByte();
Range <<= 8;
}
}
public uint GetThreshold(uint total)
{
return Code / (Range /= total);
}
public void Decode(uint start, uint size, uint total)
{
Code -= start * Range;
Range *= size;
Normalize();
}
public uint DecodeDirectBits(int numTotalBits)
{
uint range = Range;
uint code = Code;
uint result = 0;
for (int i = numTotalBits; i > 0; i--)
{
range >>= 1;
/*
result <<= 1;
if (code >= range)
{
code -= range;
result |= 1;
}
*/
uint t = (code - range) >> 31;
code -= range & (t - 1);
result = (result << 1) | (1 - t);
if (range < kTopValue)
{
code = (code << 8) | (byte)Stream.ReadByte();
range <<= 8;
}
}
Range = range;
Code = code;
return result;
}
public uint DecodeBit(uint size0, int numTotalBits)
{
uint newBound = (Range >> numTotalBits) * size0;
uint symbol;
if (Code < newBound)
{
symbol = 0;
Range = newBound;
}
else
{
symbol = 1;
Code -= newBound;
Range -= newBound;
}
Normalize();
return symbol;
}
// ulong GetProcessedSize() {return Stream.GetProcessedSize(); }
}
}
using System;
namespace SevenZip.Compression.RangeCoder
{
struct BitEncoder
{
public const int kNumBitModelTotalBits = 11;
public const uint kBitModelTotal = (1 << kNumBitModelTotalBits);
const int kNumMoveBits = 5;
const int kNumMoveReducingBits = 2;
public const int kNumBitPriceShiftBits = 6;
uint Prob;
public void Init() { Prob = kBitModelTotal >> 1; }
public void UpdateModel(uint symbol)
{
if (symbol == 0)
Prob += (kBitModelTotal - Prob) >> kNumMoveBits;
else
Prob -= (Prob) >> kNumMoveBits;
}
public void Encode(Encoder encoder, uint symbol)
{
// encoder.EncodeBit(Prob, kNumBitModelTotalBits, symbol);
// UpdateModel(symbol);
uint newBound = (encoder.Range >> kNumBitModelTotalBits) * Prob;
if (symbol == 0)
{
encoder.Range = newBound;
Prob += (kBitModelTotal - Prob) >> kNumMoveBits;
}
else
{
encoder.Low += newBound;
encoder.Range -= newBound;
Prob -= (Prob) >> kNumMoveBits;
}
if (encoder.Range < Encoder.kTopValue)
{
encoder.Range <<= 8;
encoder.ShiftLow();
}
}
private static UInt32[] ProbPrices = new UInt32[kBitModelTotal >> kNumMoveReducingBits];
static BitEncoder()
{
const int kNumBits = (kNumBitModelTotalBits - kNumMoveReducingBits);
for (int i = kNumBits - 1; i >= 0; i--)
{
UInt32 start = (UInt32)1 << (kNumBits - i - 1);
UInt32 end = (UInt32)1 << (kNumBits - i);
for (UInt32 j = start; j < end; j++)
ProbPrices[j] = ((UInt32)i << kNumBitPriceShiftBits) +
(((end - j) << kNumBitPriceShiftBits) >> (kNumBits - i - 1));
}
}
public uint GetPrice(uint symbol)
{
return ProbPrices[(((Prob - symbol) ^ ((-(int)symbol))) & (kBitModelTotal - 1)) >> kNumMoveReducingBits];
}
public uint GetPrice0() { return ProbPrices[Prob >> kNumMoveReducingBits]; }
public uint GetPrice1() { return ProbPrices[(kBitModelTotal - Prob) >> kNumMoveReducingBits]; }
}
struct BitDecoder
{
public const int kNumBitModelTotalBits = 11;
public const uint kBitModelTotal = (1 << kNumBitModelTotalBits);
const int kNumMoveBits = 5;
uint Prob;
public void UpdateModel(int numMoveBits, uint symbol)
{
if (symbol == 0)
Prob += (kBitModelTotal - Prob) >> numMoveBits;
else
Prob -= (Prob) >> numMoveBits;
}
public void Init() { Prob = kBitModelTotal >> 1; }
public uint Decode(RangeCoder.Decoder rangeDecoder)
{
uint newBound = (uint)(rangeDecoder.Range >> kNumBitModelTotalBits) * (uint)Prob;
if (rangeDecoder.Code < newBound)
{
rangeDecoder.Range = newBound;
Prob += (kBitModelTotal - Prob) >> kNumMoveBits;
if (rangeDecoder.Range < Decoder.kTopValue)
{
rangeDecoder.Code = (rangeDecoder.Code << 8) | (byte)rangeDecoder.Stream.ReadByte();
rangeDecoder.Range <<= 8;
}
return 0;
}
else
{
rangeDecoder.Range -= newBound;
rangeDecoder.Code -= newBound;
Prob -= (Prob) >> kNumMoveBits;
if (rangeDecoder.Range < Decoder.kTopValue)
{
rangeDecoder.Code = (rangeDecoder.Code << 8) | (byte)rangeDecoder.Stream.ReadByte();
rangeDecoder.Range <<= 8;
}
return 1;
}
}
}
}
using System;
namespace SevenZip.Compression.RangeCoder
{
struct BitTreeEncoder
{
BitEncoder[] Models;
int NumBitLevels;
public BitTreeEncoder(int numBitLevels)
{
NumBitLevels = numBitLevels;
Models = new BitEncoder[1 << numBitLevels];
}
public void Init()
{
for (uint i = 1; i < (1 << NumBitLevels); i++)
Models[i].Init();
}
public void Encode(Encoder rangeEncoder, UInt32 symbol)
{
UInt32 m = 1;
for (int bitIndex = NumBitLevels; bitIndex > 0; )
{
bitIndex--;
UInt32 bit = (symbol >> bitIndex) & 1;
Models[m].Encode(rangeEncoder, bit);
m = (m << 1) | bit;
}
}
public void ReverseEncode(Encoder rangeEncoder, UInt32 symbol)
{
UInt32 m = 1;
for (UInt32 i = 0; i < NumBitLevels; i++)
{
UInt32 bit = symbol & 1;
Models[m].Encode(rangeEncoder, bit);
m = (m << 1) | bit;
symbol >>= 1;
}
}
public UInt32 GetPrice(UInt32 symbol)
{
UInt32 price = 0;
UInt32 m = 1;
for (int bitIndex = NumBitLevels; bitIndex > 0; )
{
bitIndex--;
UInt32 bit = (symbol >> bitIndex) & 1;
price += Models[m].GetPrice(bit);
m = (m << 1) + bit;
}
return price;
}
public UInt32 ReverseGetPrice(UInt32 symbol)
{
UInt32 price = 0;
UInt32 m = 1;
for (int i = NumBitLevels; i > 0; i--)
{
UInt32 bit = symbol & 1;
symbol >>= 1;
price += Models[m].GetPrice(bit);
m = (m << 1) | bit;
}
return price;
}
public static UInt32 ReverseGetPrice(BitEncoder[] Models, UInt32 startIndex,
int NumBitLevels, UInt32 symbol)
{
UInt32 price = 0;
UInt32 m = 1;
for (int i = NumBitLevels; i > 0; i--)
{
UInt32 bit = symbol & 1;
symbol >>= 1;
price += Models[startIndex + m].GetPrice(bit);
m = (m << 1) | bit;
}
return price;
}
public static void ReverseEncode(BitEncoder[] Models, UInt32 startIndex,
Encoder rangeEncoder, int NumBitLevels, UInt32 symbol)
{
UInt32 m = 1;
for (int i = 0; i < NumBitLevels; i++)
{
UInt32 bit = symbol & 1;
Models[startIndex + m].Encode(rangeEncoder, bit);
m = (m << 1) | bit;
symbol >>= 1;
}
}
}
struct BitTreeDecoder
{
BitDecoder[] Models;
int NumBitLevels;
public BitTreeDecoder(int numBitLevels)
{
NumBitLevels = numBitLevels;
Models = new BitDecoder[1 << numBitLevels];
}
public void Init()
{
for (uint i = 1; i < (1 << NumBitLevels); i++)
Models[i].Init();
}
public uint Decode(RangeCoder.Decoder rangeDecoder)
{
uint m = 1;
for (int bitIndex = NumBitLevels; bitIndex > 0; bitIndex--)
m = (m << 1) + Models[m].Decode(rangeDecoder);
return m - ((uint)1 << NumBitLevels);
}
public uint ReverseDecode(RangeCoder.Decoder rangeDecoder)
{
uint m = 1;
uint symbol = 0;
for (int bitIndex = 0; bitIndex < NumBitLevels; bitIndex++)
{
uint bit = Models[m].Decode(rangeDecoder);
m <<= 1;
m += bit;
symbol |= (bit << bitIndex);
}
return symbol;
}
public static uint ReverseDecode(BitDecoder[] Models, UInt32 startIndex,
RangeCoder.Decoder rangeDecoder, int NumBitLevels)
{
uint m = 1;
uint symbol = 0;
for (int bitIndex = 0; bitIndex < NumBitLevels; bitIndex++)
{
uint bit = Models[startIndex + m].Decode(rangeDecoder);
m <<= 1;
m += bit;
symbol |= (bit << bitIndex);
}
return symbol;
}
}
}
// ICoder.h
using System;
namespace SevenZip
{
/// <summary>
/// The exception that is thrown when an error occurs in the input stream during decoding.
/// </summary>
class DataErrorException : ApplicationException
{
public DataErrorException(): base("Data Error") { }
}
/// <summary>
/// The exception that is thrown when the value of an argument is outside the allowable range.
/// </summary>
class InvalidParamException : ApplicationException
{
public InvalidParamException(): base("Invalid Parameter") { }
}
public interface ICodeProgress
{
/// <summary>
/// Callback progress.
/// </summary>
/// <param name="inSize">
/// input size. -1 if unknown.
/// </param>
/// <param name="outSize">
/// output size. -1 if unknown.
/// </param>
void SetProgress(Int64 inSize, Int64 outSize);
};
public interface ICoder
{
/// <summary>
/// Codes streams.
/// </summary>
/// <param name="inStream">
/// input Stream.
/// </param>
/// <param name="outStream">
/// output Stream.
/// </param>
/// <param name="inSize">
/// input Size. -1 if unknown.
/// </param>
/// <param name="outSize">
/// output Size. -1 if unknown.
/// </param>
/// <param name="progress">
/// callback progress reference.
/// </param>
/// <exception cref="SevenZip.DataErrorException">
/// if input stream is not valid
/// </exception>
void Code(System.IO.Stream inStream, System.IO.Stream outStream,
Int64 inSize, Int64 outSize, ICodeProgress progress);
};
/*
public interface ICoder2
{
void Code(ISequentialInStream []inStreams,
const UInt64 []inSizes,
ISequentialOutStream []outStreams,
UInt64 []outSizes,
ICodeProgress progress);
};
*/
/// <summary>
/// Provides the fields that represent property identifiers for compressing.
/// </summary>
public enum CoderPropID
{
/// <summary>
/// Specifies default property.
/// </summary>
DefaultProp = 0,
/// <summary>
/// Specifies size of dictionary.
/// </summary>
DictionarySize,
/// <summary>
/// Specifies size of memory for PPM*.
/// </summary>
UsedMemorySize,
/// <summary>
/// Specifies order for PPM methods.
/// </summary>
Order,
/// <summary>
/// Specifies Block Size.
/// </summary>
BlockSize,
/// <summary>
/// Specifies the number of position state bits for LZMA (0 &lt;= x &lt;= 4).
/// </summary>
PosStateBits,
/// <summary>
/// Specifies the number of literal context bits for LZMA (0 &lt;= x &lt;= 8).
/// </summary>
LitContextBits,
/// <summary>
/// Specifies the number of literal position bits for LZMA (0 &lt;= x &lt;= 4).
/// </summary>
LitPosBits,
/// <summary>
/// Specifies number of fast bytes for LZ*.
/// </summary>
NumFastBytes,
/// <summary>
/// Specifies match finder. LZMA: "BT2", "BT4" or "BT4B".
/// </summary>
MatchFinder,
/// <summary>
/// Specifies the number of match finder cycles.
/// </summary>
MatchFinderCycles,
/// <summary>
/// Specifies number of passes.
/// </summary>
NumPasses,
/// <summary>
/// Specifies the algorithm number.
/// </summary>
Algorithm,
/// <summary>
/// Specifies the number of threads.
/// </summary>
NumThreads,
/// <summary>
/// Specifies mode with end marker.
/// </summary>
EndMarker
};
public interface ISetCoderProperties
{
void SetCoderProperties(CoderPropID[] propIDs, object[] properties);
};
public interface IWriteCoderProperties
{
void WriteCoderProperties(System.IO.Stream outStream);
}
public interface ISetDecoderProperties
{
void SetDecoderProperties(byte[] properties);
}
}
using System;
using System.Collections.Generic;
using System.Text;
namespace LemmaSharp.Classes
{
static class Constants
{
public const string Separator = "|";
}
}
using System;
using System.IO;
namespace LemmaSharp.Classes {
public class LemmaRule {
// Private Variables -----------------------
private int iId;
private int iFrom;
private string sFrom;
private string sTo;
private string sSignature;
private LemmatizerSettings lsett;
// Constructor(s) & Destructor(s) ---------
public LemmaRule(string sWord, string sLemma, int iId, LemmatizerSettings lsett) {
this.lsett = lsett;
this.iId = iId;
int iSameStem = SameStem(sWord, sLemma);
sTo = sLemma.Substring(iSameStem);
iFrom = sWord.Length - iSameStem;
if (lsett.bUseFromInRules) {
sFrom = sWord.Substring(iSameStem);
sSignature = "[" + sFrom + "]==>[" + sTo + "]";
}
else {
sFrom = null;
sSignature = "[#" + iFrom + "]==>[" + sTo + "]";
}
}
// Public Properties ---------------------
public string Signature {
get {
return sSignature;
}
}
public int Id {
get {
return iId;
}
}
// Essential Class Functions -------------
private static int SameStem(string sStr1, string sStr2) {
int iLen1 = sStr1.Length;
int iLen2 = sStr2.Length;
int iMaxLen = Math.Min(iLen1, iLen2);
for (int iPos = 0; iPos < iMaxLen; iPos++)
if (sStr1[iPos] != sStr2[iPos]) return iPos;
return iMaxLen;
}
public bool IsApplicableToGroup(int iGroupCondLen) {
return iGroupCondLen >= iFrom;
}
public string Lemmatize(string sWord)
{
// if the removed part is upper, replace by an uppercase string
var isRemovedPartUpper = IsFullyUpper(sWord.Substring(sWord.Length - iFrom, iFrom));
return sWord.Substring(0, sWord.Length - iFrom) + (isRemovedPartUpper ? sTo.ToUpper() : sTo);
}
// Output Functions (ToString) ----------
public override string ToString() {
return iId + ":" + sSignature;
}
// Serialization Functions (regular) -----
public void Serialize(StreamWriter sWrt, bool bThisTopObject)
{
//save metadata
sWrt.Write(bThisTopObject); sWrt.Write(Constants.Separator);
//save value types --------------------------------------
sWrt.Write(iId); sWrt.Write(Constants.Separator);
sWrt.Write(iFrom); sWrt.Write(Constants.Separator);
if (sFrom == null)
{
sWrt.Write(false); sWrt.Write(Constants.Separator);
}
else
{
sWrt.Write(true); sWrt.Write(Constants.Separator);
sWrt.Write(sFrom); sWrt.Write(Constants.Separator);
}
sWrt.Write(sTo); sWrt.Write(Constants.Separator);
sWrt.Write(sSignature); sWrt.Write(Constants.Separator);
if (bThisTopObject)
{
lsett.Serialize(sWrt); sWrt.Write(Constants.Separator);
}
sWrt.WriteLine();
}
// Serialization Functions (Binary) -----
public void Serialize(BinaryWriter binWrt, bool bThisTopObject) {
//save metadata
binWrt.Write(bThisTopObject);
//save value types --------------------------------------
binWrt.Write(iId);
binWrt.Write(iFrom);
if (sFrom == null)
{
binWrt.Write(false);
}
else
{
binWrt.Write(true);
binWrt.Write(sFrom);
}
binWrt.Write(sTo);
binWrt.Write(sSignature);
if (bThisTopObject)
{
lsett.Serialize(binWrt);
}
}
public void Deserialize(BinaryReader binRead, LemmatizerSettings lsett) {
//load metadata
bool bThisTopObject = binRead.ReadBoolean();
//load value types --------------------------------------
iId = binRead.ReadInt32();
iFrom = binRead.ReadInt32();
if (binRead.ReadBoolean())
{
sFrom = binRead.ReadString();
}
else
{
sFrom = null;
}
sTo = binRead.ReadString();
sSignature = binRead.ReadString();
//load reference types if needed -------------------------
if (bThisTopObject)
{
this.lsett = new LemmatizerSettings(binRead);
}
else
{
this.lsett = lsett;
}
}
public LemmaRule(BinaryReader binRead, LemmatizerSettings lsett) {
this.Deserialize(binRead, lsett);
}
// Serialization Functions (Latino) -----
#if LATINO
public void Save(Latino.BinarySerializer binWrt, bool bThisTopObject) {
//save metadata
binWrt.WriteBool(bThisTopObject);
//save value types --------------------------------------
binWrt.WriteInt(iId);
binWrt.WriteInt(iFrom);
if (sFrom == null)
binWrt.WriteBool(false);
else {
binWrt.WriteBool(true);
binWrt.WriteString(sFrom);
}
binWrt.WriteString(sTo);
binWrt.WriteString(sSignature);
if (bThisTopObject)
lsett.Save(binWrt);
}
public void Load(Latino.BinarySerializer binRead, LemmatizerSettings lsett) {
//load metadata
bool bThisTopObject = binRead.ReadBool();
//load value types --------------------------------------
iId = binRead.ReadInt();
iFrom = binRead.ReadInt();
if (binRead.ReadBool())
sFrom = binRead.ReadString();
else
sFrom = null;
sTo = binRead.ReadString();
sSignature = binRead.ReadString();
//load reference types if needed -------------------------
if (bThisTopObject)
this.lsett = new LemmatizerSettings(binRead);
else
this.lsett = lsett;
}
public LemmaRule(Latino.BinarySerializer binRead, LemmatizerSettings lsett) {
Load(binRead, lsett);
}
#endif
// String utilities ------
public static bool IsFullyUpper(string value)
{
if (string.IsNullOrEmpty(value)){ return false; }
// Consider string to be uppercase if it has no lowercase letters.
for (int i = 0; i < value.Length; i++)
{
if (char.IsLower(value[i]))
{
return false;
}
}
return true;
}
}
}
using System;
using System.IO;
using System.Runtime.Serialization;
namespace LemmaSharp.Classes {
/// <summary>
/// These are the lemmagen algorithm settings that affect speed/power of the learning and lemmatizing algorithm.
/// TODO: this class will probably be removed in the future.
/// </summary>
[Serializable()]
public class LemmatizerSettings : ISerializable {
// Constructor(s) & Destructor(s) -------------------
public LemmatizerSettings() { }
// Sub-Structures ----------------------------------
/// <summary>
/// How the algorithm treats MSD tags.
/// MSD stands for the morphosyntactic description of a wordform,
/// e.g. the tags used in the Multext English lexicon.
/// </summary>
public enum MsdConsideration {
/// <summary>
/// Completely ignores MSD tags (joins examples with different tags and sums their weights).
/// </summary>
Ignore,
/// <summary>
/// Examples that are identical except for their MSD tags are kept distinct and not joined.
/// </summary>
Distinct,
/// <summary>
/// Joins examples with different tags (concatenates all msd tags).
/// </summary>
JoinAll,
/// <summary>
/// Joins examples with different tags (concatenates only distinct MSD tags; somewhat slower).
/// </summary>
JoinDistinct,
/// <summary>
/// Joins examples with different tags (new tag is the left to right substring that all joined examples share).
/// </summary>
JoinSameSubstring
}
// Public Variables --------------------------------
/// <summary>
/// True if the 'from' string should be included in the rule identifier ([from]==>[to]); false if only the length of the 'from' string is used ([#len]==>[to]).
/// </summary>
public bool bUseFromInRules = true;
/// <summary>
/// Specification how algorithm considers msd tags.
/// </summary>
public MsdConsideration eMsdConsider = MsdConsideration.Distinct;
/// <summary>
/// How many of the best rules are kept in memory for each node. Zero means unlimited.
/// </summary>
public int iMaxRulesPerNode = 0;
/// <summary>
/// If true, the build process uses a few extra heuristics to first build a left-to-right lemmatizer (one that lemmatizes the front of the word).
/// </summary>
public bool bBuildFrontLemmatizer = false;
// Cloneable functions --------------------------------
public LemmatizerSettings CloneDeep() {
return new LemmatizerSettings() {
bUseFromInRules = this.bUseFromInRules,
eMsdConsider = this.eMsdConsider,
iMaxRulesPerNode = this.iMaxRulesPerNode,
bBuildFrontLemmatizer = this.bBuildFrontLemmatizer
};
}
// Serialization Functions (ISerializable) -----------
public void GetObjectData(SerializationInfo info, StreamingContext context) {
info.AddValue("bUseFromInRules", bUseFromInRules);
info.AddValue("eMsdConsider", eMsdConsider);
info.AddValue("iMaxRulesPerNode", iMaxRulesPerNode);
info.AddValue("bBuildFrontLemmatizer", bBuildFrontLemmatizer);
}
public LemmatizerSettings(SerializationInfo info, StreamingContext context) {
bUseFromInRules = info.GetBoolean("bUseFromInRules");
eMsdConsider = (MsdConsideration)info.GetValue("eMsdConsider", typeof(MsdConsideration));
iMaxRulesPerNode = info.GetInt32("iMaxRulesPerNode");
bBuildFrontLemmatizer = info.GetBoolean("bBuildFrontLemmatizer");
}
// Serialization Functions (regular) ----------------
public void Serialize(StreamWriter sWrt)
{
sWrt.Write(bUseFromInRules); sWrt.Write(Constants.Separator);
sWrt.Write((int)eMsdConsider); sWrt.Write(Constants.Separator);
sWrt.Write(iMaxRulesPerNode); sWrt.Write(Constants.Separator);
sWrt.Write(bBuildFrontLemmatizer); sWrt.Write(Constants.Separator);
sWrt.WriteLine();
}
// Serialization Functions (Binary) -----------------
public void Serialize(BinaryWriter binWrt) {
binWrt.Write(bUseFromInRules);
binWrt.Write((int)eMsdConsider);
binWrt.Write(iMaxRulesPerNode);
binWrt.Write(bBuildFrontLemmatizer);
}
public void Deserialize(BinaryReader binRead) {
bUseFromInRules = binRead.ReadBoolean();
eMsdConsider = (MsdConsideration)binRead.ReadInt32();
iMaxRulesPerNode = binRead.ReadInt32();
bBuildFrontLemmatizer = binRead.ReadBoolean();
}
public LemmatizerSettings(BinaryReader binRead) {
this.Deserialize(binRead);
}
// Serialization Functions (Latino) -----------------
#if LATINO
public void Save(Latino.BinarySerializer binWrt) {
binWrt.WriteBool(bUseFromInRules);
binWrt.WriteInt((int)eMsdConsider);
binWrt.WriteInt(iMaxRulesPerNode);
binWrt.WriteBool(bBuildFrontLemmatizer);
}
public void Load(Latino.BinarySerializer binRead) {
bUseFromInRules = binRead.ReadBool();
eMsdConsider = (MsdConsideration)binRead.ReadInt();
iMaxRulesPerNode = binRead.ReadInt();
bBuildFrontLemmatizer = binRead.ReadBool();
}
public LemmatizerSettings(Latino.BinarySerializer reader) {
Load(reader);
}
#endif
}
}
using System.Collections.Generic;
using System.IO;
namespace LemmaSharp.Classes {
public class RuleList : Dictionary<string, LemmaRule> {
// Private Variables ------------------------
private LemmatizerSettings lsett;
private LemmaRule lrDefaultRule;
// Constructor(s) & Destructor(s) ------------
public RuleList(LemmatizerSettings lsett) {
this.lsett = lsett;
lrDefaultRule = AddRule(new LemmaRule("", "", 0, lsett));
}
// Public Properties -----------------------
public LemmaRule DefaultRule {
get {
return lrDefaultRule;
}
}
// Essential Class Functions --------------
public LemmaRule AddRule(LemmaExample le) {
return AddRule(new LemmaRule(le.Word, le.Lemma, this.Count, lsett));
}
private LemmaRule AddRule(LemmaRule lrRuleNew) {
LemmaRule lrRuleReturn = null;
if (!this.TryGetValue(lrRuleNew.Signature, out lrRuleReturn)) {
lrRuleReturn = lrRuleNew;
this.Add(lrRuleReturn.Signature, lrRuleReturn);
}
return lrRuleReturn;
}
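// Note (illustrative): rules are deduplicated by signature. For example,
// ("walking", "walk") and ("talking", "talk") both yield the signature
// "[ing]==>[]" (with bUseFromInRules), so AddRule returns the same
// LemmaRule instance for both examples.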
// Serialization Functions (regular) ------
public void Serialize(StreamWriter sWrt, bool bThisTopObject)
{
//save metadata
sWrt.Write(bThisTopObject); sWrt.WriteLine(Constants.Separator);
//save value types --------------------------------------
//save reference types if needed -------------------------
if (bThisTopObject)
{
lsett.Serialize(sWrt);
}
//save list items ---------------------------------------
int iCount = this.Count;
sWrt.WriteLine(iCount);
foreach (KeyValuePair<string, LemmaRule> kvp in this)
{
sWrt.WriteLine(kvp.Key);
kvp.Value.Serialize(sWrt, false);
}
//the default rule is already saved in the list; here just save its signature (its id in the list).
sWrt.WriteLine(lrDefaultRule.Signature);
}
// Serialization Functions (Binary) ------
public void Serialize(BinaryWriter binWrt, bool bThisTopObject) {
//save metadata
binWrt.Write(bThisTopObject);
//save value types --------------------------------------
//save reference types if needed -------------------------
if (bThisTopObject)
{
lsett.Serialize(binWrt);
}
//save list items ---------------------------------------
int iCount = this.Count;
binWrt.Write(iCount);
foreach (KeyValuePair<string, LemmaRule> kvp in this) {
binWrt.Write(kvp.Key);
kvp.Value.Serialize(binWrt, false);
}
//the default rule is already saved in the list; here just save its signature (its id in the list).
binWrt.Write(lrDefaultRule.Signature);
}
public void Deserialize(BinaryReader binRead, LemmatizerSettings lsett) {
//load metadata
bool bThisTopObject = binRead.ReadBoolean();
//load value types --------------------------------------
//load reference types if needed -------------------------
this.lsett = bThisTopObject ? new LemmatizerSettings(binRead) : lsett;
//load list items ---------------------------------------
this.Clear();
int iCount = binRead.ReadInt32();
for (int iId = 0; iId < iCount; iId++) {
string sKey = binRead.ReadString();
var lrVal = new LemmaRule(binRead, this.lsett);
this.Add(sKey, lrVal);
}
//re-link the default rule; only its signature was saved.
lrDefaultRule = this[binRead.ReadString()];
}
public RuleList(BinaryReader binRead, LemmatizerSettings lsett) {
this.Deserialize(binRead, lsett);
}
// Serialization Functions (Latino) ------
#if LATINO
public void Save(Latino.BinarySerializer binWrt, bool bThisTopObject) {
//save metadata
binWrt.WriteBool(bThisTopObject);
//save value types --------------------------------------
//save reference types if needed -------------------------
if (bThisTopObject)
lsett.Save(binWrt);
//save list items ---------------------------------------
int iCount = this.Count;
binWrt.WriteInt(iCount);
foreach (KeyValuePair<string, LemmaRule> kvp in this) {
binWrt.WriteString(kvp.Key);
kvp.Value.Save(binWrt, false);
}
//default rule is already saved in the list. Here just save its id.
binWrt.WriteString(lrDefaultRule.Signature);
}
public void Load(Latino.BinarySerializer binRead, LemmatizerSettings lsett) {
//load metadata
bool bThisTopObject = binRead.ReadBool();
//load value types --------------------------------------
//load reference types if needed -------------------------
if (bThisTopObject)
this.lsett = new LemmatizerSettings(binRead);
else
this.lsett = lsett;
//load list items ---------------------------------------
this.Clear();
int iCount = binRead.ReadInt();
for (int iId = 0; iId < iCount; iId++) {
string sKey = binRead.ReadString();
LemmaRule lrVal = new LemmaRule(binRead, this.lsett);
this.Add(sKey, lrVal);
}
//re-link the default rule; only its signature was saved.
lrDefaultRule = this[binRead.ReadString()];
}
public RuleList(Latino.BinarySerializer binRead, LemmatizerSettings lsett) {
Load(binRead, lsett);
}
#endif
}
}
using System;
namespace LemmaSharp.Classes {
[Serializable]
class RuleWeighted: IComparable<RuleWeighted>{
// Private Variables ---------------------
private LemmaRule lrRule;
private double dWeight;
// Constructor(s) & Destructor(s) -------
public RuleWeighted(LemmaRule lrRule, double dWeight) {
this.lrRule = lrRule;
this.dWeight = dWeight;
}
// Public Properties --------------------
public LemmaRule Rule {
get { return lrRule; }
}
public double Weight {
get { return dWeight; }
}
// Essential Class Functions (comparing objects, eg.: for sorting) -------
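// Note: CompareTo below orders descending - a larger weight sorts first,
// and ties on weight are broken so that the rule with the larger Id wins.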
public int CompareTo(RuleWeighted rl) {
if (this.dWeight < rl.dWeight) return 1;
if (this.dWeight > rl.dWeight) return -1;
if (this.lrRule.Id < rl.lrRule.Id) return 1;
if (this.lrRule.Id > rl.lrRule.Id) return -1;
return 0;
}
// Output & Serialization Functions -----------
public override string ToString() {
return lrRule.ToString() + dWeight.ToString("(0.00%)");
}
}
}
using System;
using System.Collections.Generic;
using System.Text;
using System.Runtime.Serialization;
namespace LemmaSharp {
public interface ILemmatizer : ISerializable {
string Lemmatize(string word);
}
}
using System;
namespace LemmaSharp {
public interface ILemmatizerModel {
string Lemmatize(string sWord);
string ToString();
}
}
using System;
using System.Collections.Generic;
//using System.Linq;
using System.Text;
using LemmaSharp.Classes;
namespace LemmaSharp {
public interface ITrainableLemmatizer:ILemmatizer {
ExampleList Examples {
get;
}
ILemmatizerModel Model {
get;
}
void AddExample(string sWord, string sLemma);
void AddExample(string sWord, string sLemma, double dWeight);
void AddExample(string sWord, string sLemma, double dWeight, string sMsd);
void BuildModel();
}
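// Typical training flow (a sketch against the interface above; any concrete
// ITrainableLemmatizer implementation is assumed):
//   trainable.AddExample("walking", "walk");
//   trainable.AddExample("talking", "talk");
//   trainable.BuildModel();
//   var lemma = trainable.Lemmatize("working"); // "work", if the learned rules generalize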
}
using System.Reflection;
using System.Runtime.CompilerServices;
using System.Runtime.InteropServices;
// General information about this assembly is controlled
// through the following set of attributes. Change these attribute
// values to modify the information associated with the assembly.
[assembly: AssemblyTitle("Jieba.Net")]
[assembly: AssemblyDescription("")]
[assembly: AssemblyConfiguration("")]
[assembly: AssemblyCompany("")]
[assembly: AssemblyProduct("Jieba.Net")]
[assembly: AssemblyCopyright("Copyright © 2021")]
[assembly: AssemblyTrademark("")]
[assembly: AssemblyCulture("")]
// Setting ComVisible to false makes the types in this assembly invisible
// to COM components. If you need to access a type in this assembly from COM,
// set the ComVisible attribute to true on that type.
[assembly: ComVisible(false)]
// The following GUID is for the ID of the typelib if this project is exposed to COM
[assembly: Guid("e4554146-b7ea-464b-9ee2-5923f7721e64")]
// Version information for an assembly consists of the following four values:
//
//      Major Version
//      Minor Version
//      Build Number
//      Revision
//
// You can specify all the values, or you can default the Build and Revision
// numbers by using '*' as shown below:
// [assembly: AssemblyVersion("1.0.*")]
[assembly: AssemblyVersion("1.0.0.0")]
[assembly: AssemblyFileVersion("1.0.0.0")]
using System;
using System.Collections.Generic;
using System.Linq;
namespace JiebaNet.Segmenter.Common
{
public interface ICounter<T>
{
int Count { get; }
int Total { get; }
int this[T key] { get; set; }
IEnumerable<KeyValuePair<T, int>> Elements { get; }
/// <summary>
/// Lists the n most common elements from the most common to the least.
/// </summary>
/// <param name="n">Number of elements to list; lists all elements if n is less than 0.</param>
/// <returns></returns>
IEnumerable<KeyValuePair<T, int>> MostCommon(int n = -1);
/// <summary>
/// Subtracts items from a counter.
/// </summary>
/// <param name="items"></param>
void Subtract(IEnumerable<T> items);
/// <summary>
/// Subtracts counts from another counter.
/// </summary>
/// <param name="other"></param>
void Subtract(ICounter<T> other);
/// <summary>
/// Adds items to a counter.
/// </summary>
/// <param name="items"></param>
void Add(IEnumerable<T> items);
/// <summary>
/// Adds another counter.
/// </summary>
/// <param name="other"></param>
void Add(ICounter<T> other);
/// <summary>
/// Union takes, for each key, the maximum of its counts in either input <see cref="ICounter{T}"/>.
/// </summary>
/// <param name="other">The other counter.</param>
ICounter<T> Union(ICounter<T> other);
void Remove(T key);
void Clear();
bool Contains(T key);
}
public class Counter<T>: ICounter<T>
{
private Dictionary<T, int> data = new Dictionary<T, int>();
public Counter() {}
public Counter(IEnumerable<T> items)
{
CountItems(items);
}
public int Count => data.Count;
public int Total => data.Values.Sum();
public IEnumerable<KeyValuePair<T, int>> Elements => data;
public int this[T key]
{
get => data.ContainsKey(key) ? data[key] : 0;
set => data[key] = value;
}
public IEnumerable<KeyValuePair<T, int>> MostCommon(int n = -1)
{
var pairs = data.Where(pair => pair.Value > 0).OrderByDescending(pair => pair.Value);
return n < 0 ? pairs : pairs.Take(n);
}
public void Subtract(IEnumerable<T> items)
{
SubtractItems(items);
}
public void Subtract(ICounter<T> other)
{
SubtractPairs(other.Elements);
}
public void Add(IEnumerable<T> items)
{
CountItems(items);
}
public void Add(ICounter<T> other)
{
CountPairs(other.Elements);
}
public ICounter<T> Union(ICounter<T> other)
{
var result = new Counter<T>();
foreach (var pair in data)
{
var count = pair.Value;
var otherCount = other[pair.Key];
var newCount = count < otherCount ? otherCount : count;
result[pair.Key] = newCount;
}
foreach (var pair in other.Elements)
{
if (!Contains(pair.Key))
{
result[pair.Key] = pair.Value;
}
}
return result;
}
public void Remove(T key)
{
if (data.ContainsKey(key))
{
data.Remove(key);
}
}
public void Clear()
{
data.Clear();
}
public bool Contains(T key)
{
return data.ContainsKey(key);
}
#region Private Methods
private void CountItems(IEnumerable<T> items)
{
foreach (var item in items)
{
data[item] = data.GetDefault(item, 0) + 1;
}
}
private void CountPairs(IEnumerable<KeyValuePair<T, int>> pairs)
{
foreach (var pair in pairs)
{
this[pair.Key] += pair.Value;
}
}
private void SubtractItems(IEnumerable<T> items)
{
foreach (var item in items)
{
data[item] = data.GetDefault(item, 0) - 1;
}
}
private void SubtractPairs(IEnumerable<KeyValuePair<T, int>> pairs)
{
foreach (var pair in pairs)
{
this[pair.Key] -= pair.Value;
}
}
#endregion
}
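// Usage sketch (based on the members above):
//   var counter = new Counter<string>(new[] { "a", "b", "a" });
//   // counter["a"]          -> 2
//   // counter.MostCommon(1) -> [("a", 2)]
//   counter.Subtract(new[] { "a" });
//   // counter["a"]          -> 1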
}
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text.RegularExpressions;
namespace JiebaNet.Segmenter.Common
{
public static class Extensions
{
private static readonly Regex RegexDigits = new Regex(@"^\d+$", RegexOptions.Compiled);
private static readonly Regex RegexNewline = new Regex("(\r\n|\n|\r)", RegexOptions.Compiled);
#region Objects
public static bool IsNull(this object obj)
{
return obj == null;
}
public static bool IsNotNull(this object obj)
{
return obj != null;
}
#endregion
#region Enumerable
public static bool IsEmpty<T>(this IEnumerable<T> enumerable)
{
return (enumerable == null) || !enumerable.Any();
}
public static bool IsNotEmpty<T>(this IEnumerable<T> enumerable)
{
return (enumerable != null) && enumerable.Any();
}
public static TValue GetOrDefault<TKey, TValue>(this IDictionary<TKey, TValue> d, TKey key)
{
return d.ContainsKey(key) ? d[key] : default(TValue);
}
public static TValue GetDefault<TKey, TValue>(this IDictionary<TKey, TValue> dict, TKey key, TValue defaultValue)
{
if (dict.ContainsKey(key))
{
return dict[key];
}
return defaultValue;
}
public static IDictionary<TKey, TValue> SetDefault<TKey, TValue>(this IDictionary<TKey, TValue> dict, TKey key, TValue defaultValue)
{
if (!dict.ContainsKey(key))
{
dict[key] = defaultValue;
}
return dict;
}
public static void Update<TKey, TValue>(this IDictionary<TKey, TValue> dict, IDictionary<TKey, TValue> other)
{
foreach (var key in other.Keys)
{
dict[key] = other[key];
}
}
#endregion
#region String & Text
public static string Left(this string s, int endIndex)
{
if (string.IsNullOrEmpty(s))
{
return s;
}
return s.Substring(0, endIndex);
}
public static string Right(this string s, int startIndex)
{
if (string.IsNullOrEmpty(s))
{
return s;
}
return s.Substring(startIndex);
}
public static string Sub(this string s, int startIndex, int endIndex)
{
return s.Substring(startIndex, endIndex - startIndex);
}
// True only when the whole string is a run of digits (RegexDigits is anchored).
public static bool IsInt32(this string s)
{
    return RegexDigits.IsMatch(s);
}
public static string[] SplitLines(this string s)
{
return s.Split(new[] { "\r\n", "\r", "\n" }, StringSplitOptions.None);
}
public static string Join(this IEnumerable<string> inputs, string separator = ", ")
{
return string.Join(separator, inputs);
}
public static IEnumerable<string> SubGroupValues(this GroupCollection groups)
{
var result = from Group g in groups
select g.Value;
return result.Skip(1);
}
#endregion
#region Conversion
public static int ToInt32(this char ch)
{
return ch;
}
public static char ToChar(this int i)
{
return (char)i;
}
#endregion
}
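// Usage sketch for the dictionary helpers above:
//   var d = new Dictionary<string, int>();
//   // d.GetDefault("a", 7)  -> 7 (and "a" is not added)
//   d.SetDefault("a", 1);    // adds "a" -> 1 only if the key is missing
//   // d.GetOrDefault("a")   -> 1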
}
\ No newline at end of file
using System;
using System.Collections.Generic;
using System.Linq;
namespace JiebaNet.Segmenter.Common
{
public class KeywordTrieNode
{
private IDictionary<char, KeywordTrieNode> _children;
// private string _value;
public KeywordTrieNode(string value = null)
{
_children = new Dictionary<char, KeywordTrieNode>();
Value = value;
}
public string Value { get; set; }
public bool HasValue => Value.IsNotNull();
public KeywordTrieNode AddChild(char ch, string value = null, bool overwrite = false)
{
var child = _children.GetOrDefault(ch);
if (child.IsNull())
{
child = new KeywordTrieNode(value);
_children[ch] = child;
}
else if (overwrite)
{
child.Value = value;
}
return child;
}
public KeywordTrieNode GetChild(char ch)
{
var child = _children.GetOrDefault(ch);
return child;
}
public bool HasChild(char ch)
{
return _children.ContainsKey(ch);
}
}
public class KeywordTrie: KeywordTrieNode
{
public KeywordTrie()
{
Count = 0;
}
public int Count { get; set; }
public bool Contains(string key)
{
return GetItem(key).IsNotNull();
}
public void Remove(string key)
{
// TODO: impl and count
this[key] = null;
}
public string this[string key]
{
get { return GetItem(key); }
set { SetItem(key, value); }
}
#region Private Methods
private string GetItem(string key)
{
KeywordTrieNode state = this;
foreach (var ch in key)
{
state = state.GetChild(ch);
if (state.IsNull())
{
return null;
}
}
return state.Value;
}
private void SetItem(string key, string value)
{
KeywordTrieNode state = this;
for (int i = 0; i < key.Length; i++)
{
if (i < key.Length - 1)
{
state = state.AddChild(key[i]);
}
else
{
var child = state.GetChild(key[i]);
state = state.AddChild(key[i], value, true);
if (child.IsNull() || !child.HasValue)
{
Count += 1;
}
}
}
}
#endregion
}
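// Usage sketch:
//   var trie = new KeywordTrie();
//   trie["c++"] = "cpp";
//   // trie.Contains("c++") -> true
//   // trie["c++"]          -> "cpp";  trie["c"] -> null (prefix only, no value)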
public class TextSpan
{
public string Text { get; }
public int Start { get; }
public int End { get; }
public TextSpan(string text, int start, int end)
{
Text = text;
Start = start;
End = end;
}
public override bool Equals(object obj)
{
return Equals(obj as TextSpan);
}
public bool Equals(TextSpan span)
{
if (ReferenceEquals(span, null))
{
return false;
}
if (ReferenceEquals(this, span))
{
return true;
}
if (this.GetType() != span.GetType())
{
return false;
}
return Text == span.Text && Start == span.Start && End == span.End;
}
public static bool operator ==(TextSpan lhs, TextSpan rhs)
{
if (ReferenceEquals(lhs, null))
{
if (ReferenceEquals(rhs, null))
{
return true;
}
return false;
}
return lhs.Equals(rhs);
}
public static bool operator !=(TextSpan lhs, TextSpan rhs)
{
return !(lhs == rhs);
}
public override int GetHashCode()
{
var hash = 13;
hash = (hash * 7) + Text.GetHashCode();
hash = (hash * 7) + Start.GetHashCode();
hash = (hash * 7) + End.GetHashCode();
return hash;
}
public override string ToString()
{
return $"<{Text}({Start}, {End})>";
}
}
}
using System;
using System.Collections.Generic;
using System.Linq;
namespace JiebaNet.Segmenter.Common
{
// Refer to: https://github.com/brianfromoregon/trie
public class TrieNode
{
public char Char { get; set; }
public int Frequency { get; set; }
public Dictionary<char, TrieNode> Children { get; set; }
public TrieNode(char ch)
{
Char = ch;
Frequency = 0;
// TODO: or an empty dict?
//Children = null;
}
public int Insert(string s, int pos, int freq = 1)
{
if (string.IsNullOrEmpty(s) || pos >= s.Length)
{
return 0;
}
if (Children == null)
{
Children = new Dictionary<char, TrieNode>();
}
var c = s[pos];
if (!Children.ContainsKey(c))
{
Children[c] = new TrieNode(c);
}
var curNode = Children[c];
if (pos == s.Length - 1)
{
curNode.Frequency += freq;
return curNode.Frequency;
}
return curNode.Insert(s, pos + 1, freq);
}
public TrieNode Search(string s, int pos)
{
if (string.IsNullOrEmpty(s))
{
return null;
}
// if out of range or without any child nodes
if (pos >= s.Length || Children == null)
{
return null;
}
// when we reach the last char of s, it's time to make the decision.
if (pos == s.Length - 1)
{
return Children.ContainsKey(s[pos]) ? Children[s[pos]] : null;
}
// continue if necessary.
return Children.ContainsKey(s[pos]) ? Children[s[pos]].Search(s, pos + 1) : null;
}
}
public interface ITrie
{
//string BestMatch(string word, long maxTime);
bool Contains(string word);
int Frequency(string word);
int Insert(string word, int freq = 1);
//bool Remove(string word);
int Count { get; }
int TotalFrequency { get; }
}
public class Trie : ITrie
{
private static readonly char RootChar = '\0';
internal TrieNode Root;
public int Count { get; private set; }
public int TotalFrequency { get; private set; }
public Trie()
{
Root = new TrieNode(RootChar);
Count = 0;
}
public bool Contains(string word)
{
CheckWord(word);
var node = Root.Search(word.Trim(), 0);
return node.IsNotNull() && node.Frequency > 0;
}
public bool ContainsPrefix(string word)
{
CheckWord(word);
var node = Root.Search(word.Trim(), 0);
return node.IsNotNull();
}
public int Frequency(string word)
{
CheckWord(word);
var node = Root.Search(word.Trim(), 0);
return node.IsNull() ? 0 : node.Frequency;
}
public int Insert(string word, int freq = 1)
{
    CheckWord(word);
    var i = Root.Insert(word.Trim(), 0, freq);
    if (i > 0)
    {
        TotalFrequency += freq;
        // Root.Insert returns the word's cumulative frequency, so the word
        // is new exactly when the returned value equals the freq just added.
        if (i == freq)
        {
            Count++;
        }
    }
    return i;
}
public IEnumerable<char> ChildChars(string prefix)
{
var node = Root.Search(prefix.Trim(), 0);
return node.IsNull() || node.Children.IsNull() ? null : node.Children.Select(p => p.Key);
}
private void CheckWord(string word)
{
if (string.IsNullOrWhiteSpace(word))
{
throw new ArgumentException("word must not be null or whitespace");
}
}
}
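// Usage sketch:
//   var trie = new Trie();
//   trie.Insert("好人");          // returns 1 (the new frequency of "好人")
//   trie.Insert("好人", 2);       // returns 3
//   // trie.Contains("好人")      -> true
//   // trie.ContainsPrefix("好")  -> true,  trie.Frequency("好") -> 0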
}
using System;
using System.Configuration;
using System.IO;
using JiebaNet.Segmenter.Common;
namespace JiebaNet.Segmenter
{
public class ConfigManager
{
private static string _configFileBaseDir = null;
public static string ConfigFileBaseDir
{
get
{
if (_configFileBaseDir.IsNull())
{
throw new InvalidOperationException("ConfigFileBaseDir has not been set; assign it before use.");
//var configFileDir = ConfigurationManager.AppSettings["JiebaConfigFileDir"] ?? "Resources";
//if (!Path.IsPathRooted(configFileDir))
//{
// var domainDir = AppDomain.CurrentDomain.BaseDirectory;
// configFileDir = Path.GetFullPath(Path.Combine(domainDir, configFileDir));
//}
//_configFileBaseDir = configFileDir;
}
return _configFileBaseDir;
}
set { _configFileBaseDir = value; }
}
//public static string MainDictFile
//{
// get { return Path.Combine(ConfigFileBaseDir, "dict.txt"); }
//}
//public static string ProbTransFile
//{
// get { return Path.Combine(ConfigFileBaseDir, "prob_trans.json"); }
//}
//public static string ProbEmitFile
//{
// get { return Path.Combine(ConfigFileBaseDir, "prob_emit.json"); }
//}
//public static string PosProbStartFile
//{
// get { return Path.Combine(ConfigFileBaseDir, "pos_prob_start.json"); }
//}
//public static string PosProbTransFile
//{
// get { return Path.Combine(ConfigFileBaseDir, "pos_prob_trans.json"); }
//}
//public static string PosProbEmitFile
//{
// get { return Path.Combine(ConfigFileBaseDir, "pos_prob_emit.json"); }
//}
//public static string CharStateTabFile
//{
// get { return Path.Combine(ConfigFileBaseDir, "char_state_tab.json"); }
//}
//public static string IdfFile => Path.Combine(ConfigFileBaseDir, "idf.txt");
//public static string StopWordsFile => Path.Combine(ConfigFileBaseDir, "stopwords.txt");
}
}
\ No newline at end of file
using System.Collections.Generic;
using System.Linq;
namespace JiebaNet.Segmenter
{
public class Constants
{
public static readonly double MinProb = -3.14e100;
public static readonly List<string> NounPos = new List<string>() { "n", "ng", "nr", "nrfg", "nrt", "ns", "nt", "nz" };
public static readonly List<string> VerbPos = new List<string>() { "v", "vd", "vg", "vi", "vn", "vq" };
public static readonly List<string> NounAndVerbPos = NounPos.Union(VerbPos).ToList();
public static readonly List<string> IdiomPos = new List<string>() { "i" };
}
}
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Threading.Tasks;
namespace JiebaNet.Segmenter
{
public class DefaultDictionary<TKey, TValue> : Dictionary<TKey, TValue>
{
public new TValue this[TKey key]
{
get
{
if (!ContainsKey(key))
{
Add(key, default(TValue));
}
return base[key];
}
set { base[key] = value; }
}
}
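// Note: reading a missing key inserts default(TValue) first, e.g.:
//   var d = new DefaultDictionary<string, int>();
//   var n = d["x"];   // n == 0, and "x" is now present in the dictionary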
}
using System;
using System.Collections.Generic;
namespace JiebaNet.Segmenter.FinalSeg
{
/// <summary>
/// After dictionary-based segmentation, this interface performs the final cut; the default implementation is the HMM-based method.
/// </summary>
public interface IFinalSeg
{
IEnumerable<string> Cut(string sentence);
}
}
\ No newline at end of file
using System;
using System.Collections.Generic;
using System.Diagnostics;
using System.IO;
using System.Linq;
using System.Text.RegularExpressions;
using JiebaNet.Segmenter.Common;
using Kivii.Text;
using Kivii;
using System.Reflection;
namespace JiebaNet.Segmenter.FinalSeg
{
public class Viterbi : IFinalSeg
{
private static readonly Lazy<Viterbi> Lazy = new Lazy<Viterbi>(() => new Viterbi());
private static readonly char[] States = { 'B', 'M', 'E', 'S' };
private static readonly Regex RegexChinese = new Regex(@"([\u4E00-\u9FD5]+)", RegexOptions.Compiled);
private static readonly Regex RegexSkip = new Regex(@"([a-zA-Z0-9]+(?:\.\d+)?%?)", RegexOptions.Compiled);
private static IDictionary<char, IDictionary<char, double>> _emitProbs;
private static IDictionary<char, double> _startProbs;
private static IDictionary<char, IDictionary<char, double>> _transProbs;
private static IDictionary<char, char[]> _prevStatus;
private Viterbi()
{
LoadModel();
}
// TODO: synchronized
public static Viterbi Instance
{
get { return Lazy.Value; }
}
public IEnumerable<string> Cut(string sentence)
{
var tokens = new List<string>();
foreach (var blk in RegexChinese.Split(sentence))
{
if (RegexChinese.IsMatch(blk))
{
tokens.AddRange(ViterbiCut(blk));
}
else
{
var segments = RegexSkip.Split(blk).Where(seg => !string.IsNullOrEmpty(seg));
tokens.AddRange(segments);
}
}
return tokens;
}
#region Private Helpers
private void LoadModel()
{
var stopWatch = new Stopwatch();
stopWatch.Start();
_prevStatus = new Dictionary<char, char[]>()
{
{'B', new []{'E', 'S'}},
{'M', new []{'M', 'B'}},
{'S', new []{'S', 'E'}},
{'E', new []{'B', 'M'}}
};
_startProbs = new Dictionary<char, double>()
{
{'B', -0.26268660809250016},
{'E', -3.14e+100},
{'M', -3.14e+100},
{'S', -1.4652633398537678}
};
using (var sr = new StreamReader(Assembly.GetExecutingAssembly().GetManifestResourceStream("Jieba.Net.Segmenter.Resources.prob_trans.json")))
{
var transJson = sr.ReadToEnd();
_transProbs = JsonSerializer.DeserializeFromString<IDictionary<char, IDictionary<char, double>>>(transJson);
}
using (var sr = new StreamReader(Assembly.GetExecutingAssembly().GetManifestResourceStream("Jieba.Net.Segmenter.Resources.prob_emit.json")))
{
var emitJson = sr.ReadToEnd();
_emitProbs = JsonSerializer.DeserializeFromString<IDictionary<char, IDictionary<char, double>>>(emitJson);
}
stopWatch.Stop();
Debug.WriteLine("model loading finished, time elapsed {0} ms.", stopWatch.ElapsedMilliseconds);
}
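// The DP recurrence implemented in ViterbiCut below is the standard Viterbi
// update over the B/M/E/S states:
//   V[i][y] = max over y0 in PrevStatus[y] of ( V[i-1][y0] + trans(y0, y) ) + emit(y, sentence[i])
// The best path is then read back through the Node parent links.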
private IEnumerable<string> ViterbiCut(string sentence)
{
var v = new List<IDictionary<char, double>>();
IDictionary<char, Node> path = new Dictionary<char, Node>();
// Init weights and paths.
v.Add(new Dictionary<char, double>());
foreach (var state in States)
{
var emP = _emitProbs[state].GetDefault(sentence[0], Constants.MinProb);
v[0][state] = _startProbs[state] + emP;
path[state] = new Node(state, null);
}
// For each remaining char
for (var i = 1; i < sentence.Length; ++i)
{
IDictionary<char, double> vv = new Dictionary<char, double>();
v.Add(vv);
IDictionary<char, Node> newPath = new Dictionary<char, Node>();
foreach (var y in States)
{
var emp = _emitProbs[y].GetDefault(sentence[i], Constants.MinProb);
Pair<char> candidate = new Pair<char>('\0', double.MinValue);
foreach (var y0 in _prevStatus[y])
{
var tranp = _transProbs[y0].GetDefault(y, Constants.MinProb);
tranp = v[i - 1][y0] + tranp + emp;
if (candidate.Freq <= tranp)
{
candidate.Freq = tranp;
candidate.Key = y0;
}
}
vv[y] = candidate.Freq;
newPath[y] = new Node(y, path[candidate.Key]);
}
path = newPath;
}
var probE = v[sentence.Length - 1]['E'];
var probS = v[sentence.Length - 1]['S'];
var finalPath = probE < probS ? path['S'] : path['E'];
var posList = new List<char>(sentence.Length);
while (finalPath != null)
{
posList.Add(finalPath.Value);
finalPath = finalPath.Parent;
}
posList.Reverse();
var tokens = new List<string>();
int begin = 0, next = 0;
for (var i = 0; i < sentence.Length; i++)
{
var pos = posList[i];
if (pos == 'B')
begin = i;
else if (pos == 'E')
{
tokens.Add(sentence.Sub(begin, i + 1));
next = i + 1;
}
else if (pos == 'S')
{
tokens.Add(sentence.Sub(i, i + 1));
next = i + 1;
}
}
if (next < sentence.Length)
{
tokens.Add(sentence.Substring(next));
}
return tokens;
}
#endregion
}
}
\ No newline at end of file
using System.Collections.Generic;
using System.Linq;
using JiebaNet.Segmenter.Common;
namespace JiebaNet.Segmenter
{
public class KeywordProcessor
{
// private readonly string _keyword = "_keyword_";
// private readonly ISet<char> _whiteSpaceChars = new HashSet<char>(".\t\n\a ,");
// private readonly bool CaseSensitive;
private readonly KeywordTrie KeywordTrie = new KeywordTrie();
// Characters that can appear inside a keyword (letters, digits and underscore);
// any other character terminates a word.
private readonly ISet<char> NonWordBoundries =
    new HashSet<char>("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789_");
public bool CaseSensitive { get; }
public KeywordProcessor(bool caseSensitive = false)
{
CaseSensitive = caseSensitive;
}
public void AddKeyword(string keyword, string cleanName = null)
{
SetItem(keyword, cleanName);
}
public void AddKeywords(IEnumerable<string> keywords)
{
foreach (var keyword in keywords)
{
AddKeyword(keyword);
}
}
public void RemoveKeyword(string keyword)
{
if (!CaseSensitive)
{
keyword = keyword.ToLower();
}
KeywordTrie.Remove(keyword);
}
public void RemoveKeywords(IEnumerable<string> keywords)
{
foreach (var keyword in keywords)
{
RemoveKeyword(keyword);
}
}
public bool Contains(string word)
{
return GetItem(word).IsNotNull();
}
public IEnumerable<TextSpan> ExtractKeywordSpans(string sentence)
{
var keywordsExtracted = new List<TextSpan>();
if (sentence.IsEmpty())
{
return keywordsExtracted;
}
if (!CaseSensitive)
{
sentence = sentence.ToLower();
}
KeywordTrieNode currentState = KeywordTrie;
var seqStartPos = 0;
var seqEndPos = 0;
var resetCurrentDict = false;
var idx = 0;
var sentLen = sentence.Length;
while (idx < sentLen)
{
var ch = sentence[idx];
// when reaching a char that denotes a word end
if (!NonWordBoundries.Contains(ch))
{
// if current prefix is in trie
if (currentState.HasValue || currentState.HasChild(ch))
{
//string seqFound = null;
string longestFound = null;
var isLongerFound = false;
if (currentState.HasValue)
{
//seqFound = currentState.Value;
longestFound = currentState.Value;
seqEndPos = idx;
}
// look again for the longest sequence from this position
if (currentState.HasChild(ch))
{
var curStateContinued = currentState.GetChild(ch);
var idy = idx + 1;
while (idy < sentLen)
{
var innerCh = sentence[idy];
if (!NonWordBoundries.Contains(innerCh) && curStateContinued.HasValue)
{
longestFound = curStateContinued.Value;
seqEndPos = idy;
isLongerFound = true;
}
if(curStateContinued.HasChild(innerCh))
{
curStateContinued = curStateContinued.GetChild(innerCh);
}
else
{
break;
}
idy += 1;
}
if (idy == sentLen && curStateContinued.HasValue)
{
// end of sentence reached.
longestFound = curStateContinued.Value;
seqEndPos = idy;
isLongerFound = true;
}
if (isLongerFound)
{
idx = seqEndPos;
}
}
if (longestFound.IsNotEmpty())
{
keywordsExtracted.Add(new TextSpan(text: longestFound, start: seqStartPos, end: idx));
}
currentState = KeywordTrie;
resetCurrentDict = true;
}
else
{
currentState = KeywordTrie;
resetCurrentDict = true;
}
}
else if (currentState.HasChild(ch))
{
currentState = currentState.GetChild(ch);
}
else
{
currentState = KeywordTrie;
resetCurrentDict = true;
// skip to end of word
var idy = idx + 1;
while (idy < sentLen)
{
if (!NonWordBoundries.Contains(sentence[idy]))
{
break;
}
idy += 1;
}
idx = idy;
}
if (idx + 1 >= sentLen)
{
if (currentState.HasValue)
{
var seqFound = currentState.Value;
keywordsExtracted.Add(new TextSpan(text: seqFound, start: seqStartPos, end: sentLen));
}
}
idx += 1;
if (resetCurrentDict)
{
resetCurrentDict = false;
seqStartPos = idx;
}
}
return keywordsExtracted;
}
public IEnumerable<string> ExtractKeywords(string sentence, bool raw = false)
{
if (raw)
{
return ExtractKeywordSpans(sentence).Select(span => sentence.Sub(span.Start, span.End));
}
return ExtractKeywordSpans(sentence).Select(span => span.Text);
}
#region Private methods
private void SetItem(string keyword, string cleanName)
{
if (cleanName.IsEmpty() && keyword.IsNotEmpty())
{
cleanName = keyword;
}
if (keyword.IsNotEmpty() && cleanName.IsNotEmpty())
{
if (!CaseSensitive)
{
keyword = keyword.ToLower();
}
KeywordTrie[keyword] = cleanName;
}
}
private string GetItem(string word)
{
if (!CaseSensitive)
{
word = word.ToLower();
}
return KeywordTrie[word];
}
#endregion
}
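// Usage sketch (matching is case-insensitive by default):
//   var kp = new KeywordProcessor();
//   kp.AddKeyword("C#", "CSharp");
//   // kp.ExtractKeywords("i like c# a lot") -> ["CSharp"]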
}
\ No newline at end of file
namespace JiebaNet.Segmenter
{
public class Node
{
public char Value { get; private set; }
public Node Parent { get; private set; }
public Node(char value, Node parent)
{
Value = value;
Parent = parent;
}
}
}
\ No newline at end of file
namespace JiebaNet.Segmenter
{
public class Pair<TKey>
{
public TKey Key { get;set; }
public double Freq { get; set; }
public Pair(TKey key, double freq)
{
Key = key;
Freq = freq;
}
public override string ToString()
{
return "Candidate [Key=" + Key + ", Freq=" + Freq + "]";
}
}
}
namespace JiebaNet.Segmenter.PosSeg
{
public class Pair
{
public string Word { get; set; }
public string Flag { get; set; }
public Pair(string word, string flag)
{
Word = word;
Flag = flag;
}
public override string ToString()
{
return string.Format("{0}/{1}", Word, Flag);
}
}
}
using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Reflection;
using JiebaNet.Segmenter.Common;
using Kivii.Text;
namespace JiebaNet.Segmenter.PosSeg
{
public class Viterbi
{
private static readonly Lazy<Viterbi> Lazy = new Lazy<Viterbi>(() => new Viterbi());
private static IDictionary<string, double> _startProbs;
private static IDictionary<string, IDictionary<string, double>> _transProbs;
private static IDictionary<string, IDictionary<char, double>> _emitProbs;
private static IDictionary<char, List<string>> _stateTab;
private Viterbi()
{
LoadModel();
}
// TODO: synchronized
public static Viterbi Instance
{
get { return Lazy.Value; }
}
public IEnumerable<Pair> Cut(string sentence)
{
var probPosList = ViterbiCut(sentence);
var posList = probPosList.Item2;
var tokens = new List<Pair>();
int begin = 0, next = 0;
for (var i = 0; i < sentence.Length; i++)
{
var parts = posList[i].Split('-');
var charState = parts[0][0];
var pos = parts[1];
if (charState == 'B')
begin = i;
else if (charState == 'E')
{
tokens.Add(new Pair(sentence.Sub(begin, i + 1), pos));
next = i + 1;
}
else if (charState == 'S')
{
tokens.Add(new Pair(sentence.Sub(i, i + 1), pos));
next = i + 1;
}
}
if (next < sentence.Length)
{
tokens.Add(new Pair(sentence.Substring(next), posList[next].Split('-')[1]));
}
return tokens;
}
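// States in posList take the form "<charState>-<pos>", e.g. "B-n" (begin of
// a noun) or "S-v" (a single-char verb); Cut above stitches the B/M/E/S
// chars back into tokens and attaches the POS tag that follows the dash.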
#region Private Helpers
private static void LoadModel()
{
using (var sr = new StreamReader(Assembly.GetExecutingAssembly().GetManifestResourceStream("Jieba.Net.Segmenter.Resources.pos_prob_start.json")))
{
var startJson = sr.ReadToEnd();
_startProbs = JsonSerializer.DeserializeFromString<IDictionary<string, double>>(startJson);
}
using (var sr = new StreamReader(Assembly.GetExecutingAssembly().GetManifestResourceStream("Jieba.Net.Segmenter.Resources.pos_prob_trans.json")))
{
var transJson = sr.ReadToEnd();
_transProbs = JsonSerializer.DeserializeFromString<IDictionary<string, IDictionary<string, double>>>(transJson);
}
using (var sr = new StreamReader(Assembly.GetExecutingAssembly().GetManifestResourceStream("Jieba.Net.Segmenter.Resources.pos_prob_emit.json")))
{
var emitJson = sr.ReadToEnd();
_emitProbs = JsonSerializer.DeserializeFromString<IDictionary<string, IDictionary<char, double>>>(emitJson);
}
using (var sr = new StreamReader(Assembly.GetExecutingAssembly().GetManifestResourceStream("Jieba.Net.Segmenter.Resources.char_state_tab.json")))
{
var tabJson = sr.ReadToEnd();
_stateTab = JsonSerializer.DeserializeFromString<IDictionary<char, List<string>>>(tabJson);
}
}
// TODO: change sentence to obs?
private Tuple<double, List<string>> ViterbiCut(string sentence)
{
var v = new List<IDictionary<string, double>>();
var memPath = new List<IDictionary<string, string>>();
var allStates = _transProbs.Keys.ToList();
// Init weights and paths.
v.Add(new Dictionary<string, Double>());
memPath.Add(new Dictionary<string, string>());
foreach (var state in _stateTab.GetDefault(sentence[0], allStates))
{
var emP = _emitProbs[state].GetDefault(sentence[0], Constants.MinProb);
v[0][state] = _startProbs[state] + emP;
memPath[0][state] = string.Empty;
}
// For each remaining char
for (var i = 1; i < sentence.Length; ++i)
{
v.Add(new Dictionary<string, double>());
memPath.Add(new Dictionary<string, string>());
var prevStates = memPath[i - 1].Keys.Where(k => _transProbs[k].Count > 0);
var curPossibleStates = new HashSet<string>(prevStates.SelectMany(s => _transProbs[s].Keys));
IEnumerable<string> obsStates = _stateTab.GetDefault(sentence[i], allStates);
obsStates = curPossibleStates.Intersect(obsStates);
if (!obsStates.Any())
{
if (curPossibleStates.Count > 0)
{
obsStates = curPossibleStates;
}
else
{
obsStates = allStates;
}
}
foreach (var y in obsStates)
{
var emp = _emitProbs[y].GetDefault(sentence[i], Constants.MinProb);
var prob = double.MinValue;
var state = string.Empty;
foreach (var y0 in prevStates)
{
var tranp = _transProbs[y0].GetDefault(y, double.MinValue);
tranp = v[i - 1][y0] + tranp + emp;
// TODO: compare two very small values;
// TODO: how to deal with negative infinity
if (prob < tranp ||
(prob == tranp && string.Compare(state, y0, StringComparison.InvariantCulture) < 0))
{
prob = tranp;
state = y0;
}
}
v[i][y] = prob;
memPath[i][y] = state;
}
}
var vLast = v.Last();
var last = memPath.Last().Keys.Select(y => new {State = y, Prob = vLast[y]});
var endProb = double.MinValue;
var endState = string.Empty;
foreach (var endPoint in last)
{
// TODO: compare two very small values;
if (endProb < endPoint.Prob ||
(endProb == endPoint.Prob && String.Compare(endState, endPoint.State, StringComparison.InvariantCulture) < 0))
{
endProb = endPoint.Prob;
endState = endPoint.State;
}
}
var route = new string[sentence.Length];
var n = sentence.Length - 1;
var curState = endState;
while(n >= 0)
{
route[n] = curState;
curState = memPath[n][curState];
n--;
}
return new Tuple<double, List<string>>(endProb, route.ToList());
}
#endregion
}
}
\ No newline at end of file
{
"E-e": -3.14e+100,
"E-d": -3.14e+100,
"E-g": -3.14e+100,
"E-f": -3.14e+100,
"E-a": -3.14e+100,
"E-c": -3.14e+100,
"E-b": -3.14e+100,
"E-m": -3.14e+100,
"S-rg": -10.275268591948773,
"E-o": -3.14e+100,
"E-n": -3.14e+100,
"E-i": -3.14e+100,
"E-h": -3.14e+100,
"E-k": -3.14e+100,
"E-j": -3.14e+100,
"E-u": -3.14e+100,
"E-t": -3.14e+100,
"E-w": -3.14e+100,
"E-v": -3.14e+100,
"E-q": -3.14e+100,
"E-p": -3.14e+100,
"E-s": -3.14e+100,
"M-bg": -3.14e+100,
"M-uj": -3.14e+100,
"E-y": -3.14e+100,
"E-x": -3.14e+100,
"E-z": -3.14e+100,
"B-uz": -3.14e+100,
"S-d": -3.903919764181873,
"M-rg": -3.14e+100,
"E-nt": -3.14e+100,
"B-d": -3.9750475297585357,
"B-uv": -3.14e+100,
"E-vi": -3.14e+100,
"B-mq": -6.78695300139688,
"M-rr": -3.14e+100,
"S-ag": -6.954113917960154,
"M-jn": -3.14e+100,
"E-l": -3.14e+100,
"M-rz": -3.14e+100,
"B-ud": -3.14e+100,
"S-an": -12.84021794941031,
"B-qg": -3.14e+100,
"B-ug": -3.14e+100,
"M-y": -3.14e+100,
"S-qg": -3.14e+100,
"S-z": -3.14e+100,
"S-y": -6.1970794699489575,
"S-x": -8.427419656069674,
"S-w": -3.14e+100,
"S-v": -3.053292303412302,
"S-u": -6.940320595827818,
"S-t": -3.14e+100,
"B-nrt": -4.985642733519195,
"S-r": -2.7635336784127853,
"S-q": -4.888658618255058,
"M-zg": -3.14e+100,
"S-o": -8.464460927750023,
"S-n": -3.8551483897645107,
"B-zg": -3.14e+100,
"S-l": -3.14e+100,
"S-k": -6.940320595827818,
"S-in": -3.14e+100,
"S-i": -3.14e+100,
"S-h": -8.650563207383884,
"S-g": -6.507826815331734,
"B-f": -5.491630418482717,
"S-e": -5.942513006281674,
"M-en": -3.14e+100,
"S-c": -4.786966795861212,
"S-b": -6.472888763970454,
"S-a": -3.9025396831295227,
"B-g": -3.14e+100,
"B-b": -5.018374362109218,
"B-c": -3.423880184954888,
"M-ug": -3.14e+100,
"B-a": -4.762305214596967,
"E-qe": -3.14e+100,
"M-x": -3.14e+100,
"E-nz": -3.14e+100,
"M-z": -3.14e+100,
"M-u": -3.14e+100,
"B-k": -3.14e+100,
"M-w": -3.14e+100,
"B-jn": -3.14e+100,
"S-yg": -13.533365129970255,
"B-o": -8.433498702146057,
"B-l": -4.905883584659895,
"B-m": -3.6524299819046386,
"M-m": -3.14e+100,
"M-l": -3.14e+100,
"M-o": -3.14e+100,
"M-n": -3.14e+100,
"M-i": -3.14e+100,
"M-h": -3.14e+100,
"B-t": -3.3647479094528574,
"M-ul": -3.14e+100,
"B-z": -7.045681111485645,
"M-d": -3.14e+100,
"M-mg": -3.14e+100,
"B-y": -9.844485675856319,
"M-a": -3.14e+100,
"S-nrt": -3.14e+100,
"M-c": -3.14e+100,
"M-uz": -3.14e+100,
"E-mg": -3.14e+100,
"B-i": -6.1157847275557105,
"M-b": -3.14e+100,
"E-uz": -3.14e+100,
"B-n": -1.6966257797548328,
"E-uv": -3.14e+100,
"M-ud": -3.14e+100,
"M-p": -3.14e+100,
"E-ul": -3.14e+100,
"E-mq": -3.14e+100,
"M-s": -3.14e+100,
"M-yg": -3.14e+100,
"E-uj": -3.14e+100,
"E-ud": -3.14e+100,
"S-ln": -3.14e+100,
"M-r": -3.14e+100,
"E-ng": -3.14e+100,
"B-r": -3.4098187790818413,
"E-en": -3.14e+100,
"M-qg": -3.14e+100,
"B-s": -5.522673590839954,
"S-rr": -3.14e+100,
"B-p": -4.200984132085048,
"B-dg": -3.14e+100,
"M-uv": -3.14e+100,
"S-zg": -3.14e+100,
"B-v": -2.6740584874265685,
"S-tg": -6.272842531880403,
"B-w": -3.14e+100,
"B-e": -8.563551830394255,
"M-k": -3.14e+100,
"M-j": -3.14e+100,
"B-df": -8.888974230828882,
"M-e": -3.14e+100,
"E-tg": -3.14e+100,
"M-t": -3.14e+100,
"E-nr": -3.14e+100,
"M-nrfg": -3.14e+100,
"B-nr": -2.2310495913769506,
"E-df": -3.14e+100,
"E-dg": -3.14e+100,
"S-jn": -3.14e+100,
"M-q": -3.14e+100,
"B-mg": -3.14e+100,
"B-ln": -3.14e+100,
"M-f": -3.14e+100,
"E-ln": -3.14e+100,
"E-yg": -3.14e+100,
"S-bg": -3.14e+100,
"E-ns": -3.14e+100,
"B-tg": -3.14e+100,
"E-qg": -3.14e+100,
"S-nr": -4.483663103956885,
"S-ns": -3.14e+100,
"M-vn": -3.14e+100,
"S-nt": -12.147070768850364,
"S-nz": -3.14e+100,
"S-ad": -11.048458480182255,
"B-yg": -3.14e+100,
"M-v": -3.14e+100,
"E-vn": -3.14e+100,
"S-ng": -4.913434861102905,
"M-g": -3.14e+100,
"M-nt": -3.14e+100,
"S-en": -3.14e+100,
"M-nr": -3.14e+100,
"M-ns": -3.14e+100,
"S-vq": -3.14e+100,
"B-uj": -3.14e+100,
"M-nz": -3.14e+100,
"B-qe": -3.14e+100,
"M-in": -3.14e+100,
"M-ng": -3.14e+100,
"S-vn": -11.453923588290419,
"E-zg": -3.14e+100,
"S-vi": -3.14e+100,
"S-vg": -5.9430181843676895,
"S-vd": -3.14e+100,
"B-ad": -6.680066036784177,
"E-rz": -3.14e+100,
"B-ag": -3.14e+100,
"B-vd": -9.044728760238115,
"S-mq": -3.14e+100,
"B-vi": -12.434752841302146,
"E-rr": -3.14e+100,
"B-rr": -12.434752841302146,
"M-vq": -3.14e+100,
"E-jn": -3.14e+100,
"B-vn": -4.3315610890163585,
"S-mg": -10.825314928868044,
"B-in": -3.14e+100,
"M-vi": -3.14e+100,
"M-an": -3.14e+100,
"M-vd": -3.14e+100,
"B-rg": -3.14e+100,
"M-vg": -3.14e+100,
"M-ad": -3.14e+100,
"M-ag": -3.14e+100,
"E-rg": -3.14e+100,
"S-uz": -9.299258625372996,
"B-en": -3.14e+100,
"S-uv": -8.15808672228609,
"S-df": -3.14e+100,
"S-dg": -8.948397651299683,
"M-qe": -3.14e+100,
"B-ng": -3.14e+100,
"E-bg": -3.14e+100,
"S-ul": -8.4153713175535,
"S-uj": -6.85251045118004,
"S-ug": -7.5394037026636855,
"B-ns": -2.8228438314969213,
"S-ud": -7.728230161053767,
"B-nt": -4.846091668182416,
"B-ul": -3.14e+100,
"E-in": -3.14e+100,
"B-bg": -3.14e+100,
"M-df": -3.14e+100,
"M-dg": -3.14e+100,
"M-nrt": -3.14e+100,
"B-j": -5.0576191284681915,
"E-ug": -3.14e+100,
"E-vq": -3.14e+100,
"B-vg": -3.14e+100,
"B-nz": -3.94698846057672,
"S-qe": -3.14e+100,
"B-rz": -7.946116471570005,
"B-nrfg": -5.873722175405573,
"E-ad": -3.14e+100,
"E-ag": -3.14e+100,
"B-u": -9.163917277503234,
"M-ln": -3.14e+100,
"B-an": -8.697083223018778,
"M-mq": -3.14e+100,
"E-an": -3.14e+100,
"S-s": -3.14e+100,
"B-q": -6.998123858956596,
"E-nrt": -3.14e+100,
"B-h": -13.533365129970255,
"E-r": -3.14e+100,
"S-p": -2.9868401813596317,
"M-tg": -3.14e+100,
"S-rz": -3.14e+100,
"S-nrfg": -3.14e+100,
"B-vq": -12.147070768850364,
"B-x": -3.14e+100,
"E-vd": -3.14e+100,
"E-nrfg": -3.14e+100,
"S-m": -3.269200652116097,
"E-vg": -3.14e+100,
"S-f": -5.194820249981676,
"S-j": -4.911992119644354
}
\ No newline at end of file
{
"M": {
"M": -1.2603623820268226,
"E": -0.33344856811948514
},
"S": {
"S": -0.6658631448798212,
"B": -0.7211965654669841
},
"B": {
"M": -0.916290731874155,
"E": -0.51082562376599
},
"E": {
"S": -0.8085250474669937,
"B": -0.5897149736854513
}
}
\ No newline at end of file
i
me
my
myself
we
our
ours
ourselves
you
your
yours
yourself
yourselves
he
him
his
himself
she
her
hers
herself
it
its
itself
they
them
their
theirs
themselves
what
which
who
whom
this
that
these
those
am
is
are
was
were
be
been
being
have
has
had
having
do
does
did
doing
a
an
the
and
but
if
or
because
as
until
while
of
at
by
for
with
about
against
between
into
through
during
before
after
above
below
to
from
up
down
in
out
on
off
over
under
again
further
then
once
here
there
when
where
why
how
all
any
both
each
few
more
most
other
some
such
no
nor
not
only
own
same
so
than
too
very
s
t
can
will
just
don
should
now
一番
一直
一个
一些
许多
有的是
也就是说
哎呀
哎哟
俺们
按照
吧哒
罢了
本着
比方
比如
鄙人
彼此
别的
别说
并且
不比
不成
不单
不但
不独
不管
不光
不过
不仅
不拘
不论
不怕
不然
不如
不特
不惟
不问
不只
朝着
趁着
除此之外
除非
除了
此间
此外
从而
但是
当着
的话
等等
叮咚
对于
多少
而况
而且
而是
而外
而言
而已
尔后
反过来
反过来说
反之
非但
非徒
否则
嘎登
各个
各位
各种
各自
根据
故此
固然
关于
果然
果真
何处
何况
何时
哼唷
呼哧
还是
还有
换句话说
换言之
或是
或者
极了
及其
及至
即便
即或
即令
即若
即使
几时
既然
既是
继而
加之
假如
假若
假使
鉴于
较之
接着
结果
紧接着
进而
尽管
经过
就是
就是说
具体地说
具体说来
开始
开外
可见
可是
可以
况且
来着
例如
连同
两者
另外
另一方面
慢说
漫说
每当
莫若
某个
某些
哪边
哪儿
哪个
哪里
哪年
哪怕
哪天
哪些
哪样
那边
那儿
那个
那会儿
那里
那么
那么些
那么样
那时
那些
那样
乃至
你们
宁可
宁肯
宁愿
啪达
旁人
凭借
其次
其二
其他
其它
其一
其余
其中
起见
岂但
恰恰相反
前后
前者
然而
然后
然则
人家
任何
任凭
如此
如果
如何
如其
如若
如上所述
若非
若是
上下
尚且
设若
设使
甚而
甚么
甚至
省得
时候
什么
什么样
使得
是的
首先
顺着
似的
虽然
虽说
虽则
随着
所以
他们
他人
它们
她们
倘或
倘然
倘若
倘使
通过
同时
万一
为何
为了
为什么
为着
嗡嗡
我们
呜呼
乌乎
无论
无宁
毋宁
相对而言
向着
沿
沿着
要不
要不然
要不是
要么
要是
也罢
也好
一旦
一方面
一来
一切
一样
一则
依照
以便
以及
以免
以至
以至于
以致
抑或
因此
因而
因为
由此可见
由于
有的
有关
有些
于是
于是乎
与此同时
与否
与其
越是
云云
再说
再者
在下
咱们
怎么办
怎么样
照着
这边
这儿
这个
这会儿
这就是说
这里
这么
这么点儿
这么些
这么样
这时
这些
这样
正如
之类
之所以
之一
只是
只限
只要
只有
至于
诸位
着呢
自从
自个儿
自各儿
自己
自家
自身
综上所述
总的来看
总的来说
总的说来
总而言之
总之
纵令
纵然
纵使
遵照
作为
喔唷
.
,
:
;
"
"
[
]
<
>
(
)
@
#
*
&
%
$
-
+
=
|
\