Commit 9993a834 by 陶然

init

parent 91845da6
...@@ -9,6 +9,7 @@ namespace LDA.LdaModel ...@@ -9,6 +9,7 @@ namespace LDA.LdaModel
public int K; //#Topics public int K; //#Topics
public double alpha; // Dirichlet Prior Parameter for Document->Topic public double alpha; // Dirichlet Prior Parameter for Document->Topic
public double beta; // Dirichlet Prior Parameter for Topic->Word public double beta; // Dirichlet Prior Parameter for Topic->Word
public double logLikelihood;
public double LogLikelihood public double LogLikelihood
{ {
get get
......
...@@ -130,26 +130,27 @@ namespace LDA.LdaModel ...@@ -130,26 +130,27 @@ namespace LDA.LdaModel
getLda.M = M; getLda.M = M;
getLda.phi = phi; getLda.phi = phi;
getLda.theta = theta; getLda.theta = theta;
getLda.logLikelihood = LogLikelihood;
return getLda; return getLda;
} }
public void PrintModelInfo() public void PrintModelInfo()
{ {
Console.WriteLine("Aplha: " + alpha.ToString()); //Console.WriteLine("Aplha: " + alpha.ToString());
Console.WriteLine("Beta: " + beta.ToString()); //Console.WriteLine("Beta: " + beta.ToString());
Console.WriteLine("M: " + M); //Console.WriteLine("M: " + M);
Console.WriteLine("K: " + K); //Console.WriteLine("K: " + K);
Console.WriteLine("V: " + V); //Console.WriteLine("V: " + V);
Console.WriteLine("Total iterations:" + niters); //Console.WriteLine("Total iterations:" + niters);
Console.WriteLine("Save at: " + savestep); //Console.WriteLine("Save at: " + savestep);
Console.WriteLine(); //Console.WriteLine();
} }
private void GibbsSampling(int totalIter) private void GibbsSampling(int totalIter)
{ {
for (int iter = 1; iter <= totalIter; iter++) for (int iter = 1; iter <= totalIter; iter++)
{ {
Console.Write("Iteration " + iter + ":"); //Console.Write("Iteration " + iter + ":");
var stopWatch = new Stopwatch(); var stopWatch = new Stopwatch();
stopWatch.Start(); stopWatch.Start();
for (int i = 0; i < wn; i++) for (int i = 0; i < wn; i++)
...@@ -159,7 +160,7 @@ namespace LDA.LdaModel ...@@ -159,7 +160,7 @@ namespace LDA.LdaModel
} }
stopWatch.Stop(); stopWatch.Stop();
Console.WriteLine(stopWatch.ElapsedMilliseconds / 1000.0 + " seconds"); //Console.WriteLine(stopWatch.ElapsedMilliseconds / 1000.0 + " seconds");
if (iter % savestep == 0) if (iter % savestep == 0)
{ {
//保存参数、文档分布等 //保存参数、文档分布等
...@@ -195,7 +196,7 @@ namespace LDA.LdaModel ...@@ -195,7 +196,7 @@ namespace LDA.LdaModel
} }
//主题编号 //主题编号
//sw.Write("Topic " + k + "th:\n"); //sw.Write("Topic " + k + "th:\n");
Console.WriteLine("Topic " + k + "th:\n"); //Console.WriteLine("Topic " + k + "th:\n");
//主题词 //主题词
var wordsProbsListOrdered = wordsProbsList.OrderBy(e => -e.Value).ToList(); var wordsProbsListOrdered = wordsProbsList.OrderBy(e => -e.Value).ToList();
...@@ -203,11 +204,11 @@ namespace LDA.LdaModel ...@@ -203,11 +204,11 @@ namespace LDA.LdaModel
{ {
string word = cor.GetStringByID(wordsProbsListOrdered[i].Key); string word = cor.GetStringByID(wordsProbsListOrdered[i].Key);
// sw.WriteLine("\t" + word + " " + wordsProbsListOrdered[i].Value); // sw.WriteLine("\t" + word + " " + wordsProbsListOrdered[i].Value);
Console.WriteLine("\t" + word + " " + wordsProbsListOrdered[i].Value); //Console.WriteLine("\t" + word + " " + wordsProbsListOrdered[i].Value);
} }
} }
Console.WriteLine("LogLikelihood= " + LogLikelihood); //Console.WriteLine("LogLikelihood= " + LogLikelihood);
} }
} }
} }
......
...@@ -6,32 +6,32 @@ using System.Threading.Tasks; ...@@ -6,32 +6,32 @@ using System.Threading.Tasks;
namespace LDA.LdaModel namespace LDA.LdaModel
{ {
class TopicSimilarityCos public static class TopicSimilarityCos
{ {
public double SimilarityCos() public static double SimilarityCos(List<string> lstr1, List<string> lstr2)
{ {
List<string> lstr1 = new List<string>(); //List<string> lstr1 = new List<string>();
lstr1.Add("稀疏"); //lstr1.Add("稀疏");
lstr1.Add("信号"); //lstr1.Add("信号");
lstr1.Add("算法"); //lstr1.Add("算法");
lstr1.Add("方法"); //lstr1.Add("方法");
lstr1.Add("轴承"); //lstr1.Add("轴承");
lstr1.Add("故障"); //lstr1.Add("故障");
lstr1.Add("贝叶斯"); //lstr1.Add("贝叶斯");
lstr1.Add("模型"); //lstr1.Add("模型");
lstr1.Add("发动机"); //lstr1.Add("发动机");
lstr1.Add("降低"); //lstr1.Add("降低");
List<string> lstr2 = new List<string>(); //List<string> lstr2 = new List<string>();
lstr2.Add("测量"); //lstr2.Add("测量");
lstr2.Add("发动机"); //lstr2.Add("发动机");
lstr2.Add("叶尖"); //lstr2.Add("叶尖");
lstr2.Add("间隙"); //lstr2.Add("间隙");
lstr2.Add("性能"); //lstr2.Add("性能");
lstr2.Add("控制"); //lstr2.Add("控制");
lstr2.Add("微波"); //lstr2.Add("微波");
lstr2.Add("叶片"); //lstr2.Add("叶片");
lstr2.Add("降低"); //lstr2.Add("降低");
lstr2.Add("自主"); //lstr2.Add("自主");
//求并集 //求并集
var strUnion = lstr1.Union(lstr2); var strUnion = lstr1.Union(lstr2);
...@@ -58,8 +58,7 @@ namespace LDA.LdaModel ...@@ -58,8 +58,7 @@ namespace LDA.LdaModel
//求分母(2) //求分母(2)
den2 += Math.Pow(int2[i], 2); den2 += Math.Pow(int2[i], 2);
} }
double cos = s / (Math.Sqrt(den1) * Math.Sqrt(den2)); //Console.WriteLine(cos);
Console.WriteLine(cos);
return s / (Math.Sqrt(den1) * Math.Sqrt(den2)); return s / (Math.Sqrt(den1) * Math.Sqrt(den2));
} }
} }
......
...@@ -28,7 +28,7 @@ namespace Njust.Pdf.Analysis.Entities ...@@ -28,7 +28,7 @@ namespace Njust.Pdf.Analysis.Entities
public string SourceTopic { get; set; } public string SourceTopic { get; set; }
[ApiMember(Description = "目标主题编号")] [ApiMember(Description = "目标主题编号")]
public string TargeTopic { get; set; } public string TargetTopic { get; set; }
[ApiMember(Description = "源主题词")] [ApiMember(Description = "源主题词")]
public string SourceTopicWord { get; set; } public string SourceTopicWord { get; set; }
...@@ -37,6 +37,6 @@ namespace Njust.Pdf.Analysis.Entities ...@@ -37,6 +37,6 @@ namespace Njust.Pdf.Analysis.Entities
public string TargetTopicWord { get; set; } public string TargetTopicWord { get; set; }
[ApiMember(Description = "余弦相似度")] [ApiMember(Description = "余弦相似度")]
public string CosSim { get; set; } public double CosSim { get; set; }
} }
} }
...@@ -116,9 +116,11 @@ namespace Njust.Pdf.Analysis.Tranforms ...@@ -116,9 +116,11 @@ namespace Njust.Pdf.Analysis.Tranforms
foreach (var item in preInserts) foreach (var item in preInserts)
{ {
var stream = files[item.HashCode];
var exist = exists.FirstOrDefault(o => o.HashCode == item.HashCode); var exist = exists.FirstOrDefault(o => o.HashCode == item.HashCode);
if (exist != null) if (exist != null)
{ {
KiviiContext.VirtualFiles.WriteFile(exist.ImportPath, stream);
rtns.Results.Add(exist); rtns.Results.Add(exist);
continue; continue;
} }
...@@ -126,7 +128,6 @@ namespace Njust.Pdf.Analysis.Tranforms ...@@ -126,7 +128,6 @@ namespace Njust.Pdf.Analysis.Tranforms
conn.Insert(item); conn.Insert(item);
item.RemoveAllOnlyProperties(); item.RemoveAllOnlyProperties();
rtns.Results.Add(item); rtns.Results.Add(item);
var stream = files[item.HashCode];
stream.Position = 0; stream.Position = 0;
KiviiContext.VirtualFiles.WriteFile(item.ImportPath, stream); KiviiContext.VirtualFiles.WriteFile(item.ImportPath, stream);
......
...@@ -10,6 +10,7 @@ using System.IO; ...@@ -10,6 +10,7 @@ using System.IO;
using System.Linq; using System.Linq;
using System.Reflection; using System.Reflection;
using System.Text; using System.Text;
using System.Text.RegularExpressions;
using System.Threading.Tasks; using System.Threading.Tasks;
namespace Njust.Pdf.Analysis.Tranforms namespace Njust.Pdf.Analysis.Tranforms
...@@ -433,125 +434,87 @@ namespace Njust.Pdf.Analysis.Tranforms ...@@ -433,125 +434,87 @@ namespace Njust.Pdf.Analysis.Tranforms
foreach (var kv in group) foreach (var kv in group)
{ {
List<String> englishtext = new List<string>(); if (kv.Key.Language.IsNullOrEmpty() || kv.Key.DomainName.IsNullOrEmpty() || kv.Key.PublishTime.IsNullOrEmpty()) continue;
List<String> chinesetext = new List<string>();
var items = kv.ToList(); var items = kv.ToList();
string[] lemmatizeText;
if (kv.Key.Language == "英文") if (kv.Key.Language == "英文")
{ {
for (var i = 0; i < items.Count(); i++) var englishtext = new List<string>();
{ foreach (var item in items)
englishtext.Add(items[i].Title + ". " + items[i].Abstract);
}
var EnText = LemmatizeEn.lemmatizeEnglish(englishtext);
CommandLineOption ldaoption = new CommandLineOption();
ldaoption.topics = (int)Math.Ceiling(System.Math.Sqrt(EnText.Length));
double normolK = 200;
ldaoption.alpha = ldaoption.topics / normolK;
ldaoption.beta = 0.5;
ldaoption.savestep = 500;
ldaoption.niters = 500;
ldaoption.twords = 10;
try
{
LDAGibbsSampling model = new LDAGibbsSampling();
Corpora cor = new Corpora();
cor.LoadDataFile(EnText);
var lda = model.TrainNewModel(cor, ldaoption);
for (int k = 0; k < lda.K; k++)
{
var wordsProbsList = new Dictionary<int, double>();
for (int w = 0; w < lda.V; w++)
{
wordsProbsList.Add(w, lda.phi[k][w]);
}
var wordsProbsListOrdered = wordsProbsList.OrderBy(e => -e.Value).ToList();
string word = "";
List<double> TopicWordsProbability = new List<double>(); ;
for (int i = 0; i < 10; i++)
{
word += cor.GetStringByID(wordsProbsListOrdered[i].Key) + "; ";
TopicWordsProbability.Add(wordsProbsListOrdered[i].Value);
}
var DisTimeTopic = new DisTimeTopic();
DisTimeTopic.Language = kv.Key.Language;
DisTimeTopic.DomainName = kv.Key.DomainName;
DisTimeTopic.Year = kv.Key.PublishTime;
DisTimeTopic.Topic = "Topic " + k.ToString();
DisTimeTopic.TopicWord = word;
DisTimeTopic.TopicWordProbability = TopicWordsProbability;
DisTimeTopic.Documentlist = lda.theta.ToList();
DisTimeTopic.Parameter = "Aplha: " + lda.alpha.ToString() + ";" + "Beta: " + lda.beta.ToString() + ";" + "文档数: " + lda.M.ToString() + ";" + "主题数: " + lda.K.ToString() + ";" + "词袋数: " + lda.V.ToString() + ";" + "迭代次数:" + ldaoption.niters.ToString() + ";" + "LogLikelihood= " + lda.LogLikelihood.ToString();
conn.Insert(DisTimeTopic);
DisTimeTopic.RemoveAllOnlyProperties();
rtns.Results.Add(DisTimeTopic);
}
}
catch (Exception ex)
{ {
Console.WriteLine(ex.StackTrace); englishtext.Add(item.Title + ". " + item.Abstract);
Console.WriteLine(ex.Message);
} }
lemmatizeText = LemmatizeEn.lemmatizeEnglish(englishtext);
} }
if (kv.Key.Language == "中文") else if (kv.Key.Language == "中文")
{ {
for (var i = 0; i < items.Count(); i++) var chinesetext = new List<string>();
foreach (var item in items)
{ {
chinesetext.Add(items[i].Title + "。" + items[i].Abstract); chinesetext.Add(item.Title + "。" + item.Abstract);
} }
var CnText = ChinesePreprocessing.ChineseWordSegmentation(chinesetext); lemmatizeText = ChinesePreprocessing.ChineseWordSegmentation(chinesetext);
CommandLineOption ldaoption = new CommandLineOption(); }
ldaoption.topics = (int)Math.Ceiling(System.Math.Sqrt(CnText.Length)); else continue;
double normolK = 200;
ldaoption.alpha = ldaoption.topics / normolK; CommandLineOption ldaoption = new CommandLineOption();
ldaoption.beta = 0.5; ldaoption.topics = (int)Math.Ceiling(System.Math.Sqrt(lemmatizeText.Length));
ldaoption.savestep = 500; double normolK = 200;
ldaoption.niters = 500; ldaoption.alpha = ldaoption.topics / normolK;
ldaoption.twords = 10; ldaoption.beta = 0.5;
try ldaoption.savestep = 500;
ldaoption.niters = 500;
ldaoption.twords = 10;
try
{
LDAGibbsSampling model = new LDAGibbsSampling();
Corpora cor = new Corpora();
cor.LoadDataFile(lemmatizeText);
var lda = model.TrainNewModel(cor, ldaoption);
for (int k = 0; k < lda.K; k++)
{ {
LDAGibbsSampling model = new LDAGibbsSampling(); var wordsProbsList = new Dictionary<int, double>();
Corpora cor = new Corpora();
cor.LoadDataFile(CnText); for (int w = 0; w < lda.V; w++)
var lda = model.TrainNewModel(cor, ldaoption);
for (int k = 0; k < lda.K; k++)
{ {
var wordsProbsList = new Dictionary<int, double>(); wordsProbsList.Add(w, lda.phi[k][w]);
for (int w = 0; w < lda.V; w++)
{
wordsProbsList.Add(w, lda.phi[k][w]);
}
var wordsProbsListOrdered = wordsProbsList.OrderBy(e => -e.Value).ToList();
string word = "";
List<double> TopicWordsProbability = new List<double>(); ;
for (int i = 0; i < 10; i++)
{
word += cor.GetStringByID(wordsProbsListOrdered[i].Key) + "; ";
TopicWordsProbability.Add(wordsProbsListOrdered[i].Value);
}
var DisTimeTopic = new DisTimeTopic();
DisTimeTopic.Language = kv.Key.Language;
DisTimeTopic.DomainName = kv.Key.DomainName;
DisTimeTopic.Year = kv.Key.PublishTime;
DisTimeTopic.Topic = "Topic " + k.ToString();
DisTimeTopic.TopicWord = word;
DisTimeTopic.TopicWordProbability = TopicWordsProbability;
DisTimeTopic.Documentlist = lda.theta.ToList();
DisTimeTopic.Parameter = "Aplha: " + lda.alpha.ToString() + ";" + "Beta: " + lda.beta.ToString() + ";" + "文档数: " + lda.M.ToString() + ";" + "主题数: " + lda.K.ToString() + ";" + "词袋数: " + lda.V.ToString() + ";" + "迭代次数:" + ldaoption.niters.ToString() + ";" + "LogLikelihood= " + lda.LogLikelihood.ToString();
conn.Insert(DisTimeTopic);
DisTimeTopic.RemoveAllOnlyProperties();
rtns.Results.Add(DisTimeTopic);
} }
} var wordsProbsListOrdereds = wordsProbsList.OrderBy(e => -e.Value).ToList();
catch (Exception ex) List<string> words = new List<string>();
{ List<double> TopicWordsProbability = new List<double>();
Console.WriteLine(ex.StackTrace); foreach (var wordsProbsListOrdered in wordsProbsListOrdereds)
Console.WriteLine(ex.Message); {
if (words.Count >= 10) break;
var word = cor.GetStringByID(wordsProbsListOrdered.Key);
if (word.IsNullOrEmpty()) continue;
words.Add(cor.GetStringByID(wordsProbsListOrdered.Key));
TopicWordsProbability.Add(wordsProbsListOrdered.Value);
}
var disTimeTopic = new DisTimeTopic();
disTimeTopic.Language = kv.Key.Language;
disTimeTopic.DomainName = kv.Key.DomainName;
var year= kv.Key.PublishTime.Replace(" ", "");
//提取多个数字,该方式会分别提取字符串中的数字,如:"ABC#123@AS456测试789"就会分别提取123、456、789
var reg = new Regex("[0-9]+", RegexOptions.IgnoreCase | RegexOptions.Singleline, TimeSpan.FromSeconds(2));
var mc = reg.Matches(year);
if (mc.Count <= 0) continue;
disTimeTopic.Year = mc[0].Value;
disTimeTopic.Topic = "Topic " + k.ToString();
//DisTimeTopic.TopicWord = word;
disTimeTopic.TopicWord = string.Join(";", words);
disTimeTopic.TopicWordProbability = TopicWordsProbability;
disTimeTopic.Documentlist = lda.theta.ToList();
disTimeTopic.Parameter = "Aplha: " + lda.alpha.ToString() + ";" + "Beta: " + lda.beta.ToString() + ";" + "文档数: " + lda.M.ToString() + ";" + "主题数: " + lda.K.ToString() + ";" + "词袋数: " + lda.V.ToString() + ";" + "迭代次数:" + ldaoption.niters.ToString() + ";" + "LogLikelihood= " + lda.logLikelihood.ToString();
conn.Insert(disTimeTopic);
disTimeTopic.RemoveAllOnlyProperties();
rtns.Results.Add(disTimeTopic);
} }
} }
catch (Exception ex)
{
Console.WriteLine(ex.StackTrace);
Console.WriteLine(ex.Message);
}
} }
rtns.Total = rtns.Results.Count(); rtns.Total = rtns.Results.Count();
return rtns; return rtns;
...@@ -566,6 +529,51 @@ namespace Njust.Pdf.Analysis.Tranforms ...@@ -566,6 +529,51 @@ namespace Njust.Pdf.Analysis.Tranforms
var rtns = new RestfulQueryResponse<DisTopicEvolution>(); var rtns = new RestfulQueryResponse<DisTopicEvolution>();
rtns.Results = new List<DisTopicEvolution>(); rtns.Results = new List<DisTopicEvolution>();
var conn = KiviiContext.GetOpenedDbConnection<DisTimeTopic>();
var query = conn.From<DisTimeTopic>();
query.OrderBy(o => o.Year);
var allDisTimeTopics = conn.Select(query);
var group = allDisTimeTopics.GroupBy(o => new { o.Year, o.DomainName, o.Language }).ToList();
conn.InitEntityType<DisTopicEvolution>();
conn.Delete<DisTopicEvolution>(o => o.Kvid == o.Kvid);
foreach(var kv in group)
{
var index = group.IndexOf(kv);
if (index + 1 == group.Count()) continue;
foreach(var item in kv)
{
var nextGroup = group.FirstOrDefault(o => o.Key.Language == item.Language & o.Key.DomainName == item.DomainName & o.Key.Year.ToInt() > item.Year.ToInt());
if (nextGroup == null) continue;
var sourceTopicWords = item.TopicWord.Split(';').ToList();
var disTopicEvolution = new DisTopicEvolution();
disTopicEvolution.DomainName = item.DomainName;
disTopicEvolution.Language = item.Language;
disTopicEvolution.SourceYear = item.Year;
disTopicEvolution.SourceTopic = item.Topic;
disTopicEvolution.SourceTopicWord = item.TopicWord;
foreach (var next in nextGroup)
{
var targetTopicWords = next.TopicWord.Split(';').ToList();
var cosSim = TopicSimilarityCos.SimilarityCos(sourceTopicWords, targetTopicWords);
if (disTopicEvolution.CosSim <= cosSim)
{
disTopicEvolution.CosSim = cosSim;
disTopicEvolution.TargetTopic = next.Topic;
disTopicEvolution.TargetYear = next.Year;
disTopicEvolution.TargetTopicWord = next.TopicWord;
}
}
conn.Insert(disTopicEvolution);
disTopicEvolution.RemoveAllOnlyProperties();
rtns.Results.Add(disTopicEvolution);
}
}
rtns.Total = rtns.Results.Count();
return rtns; return rtns;
} }
} }
......
Markdown is supported
0% Try again or attach a new file
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment