Commit 9993a834 by 陶然

init

parent 91845da6
......@@ -9,6 +9,7 @@ namespace LDA.LdaModel
public int K; //#Topics
public double alpha; // Dirichlet Prior Parameter for Document->Topic
public double beta; // Dirichlet Prior Parameter for Topic->Word
public double logLikelihood;
public double LogLikelihood
{
get
......
......@@ -130,26 +130,27 @@ namespace LDA.LdaModel
getLda.M = M;
getLda.phi = phi;
getLda.theta = theta;
getLda.logLikelihood = LogLikelihood;
return getLda;
}
/// <summary>
/// Writes the model's settings to the console, one labelled value per line:
/// alpha, beta, M, K, V, niters and savestep, followed by an empty line.
/// </summary>
/// <remarks>
/// NOTE(review): the alpha label is emitted as "Aplha" (sic); left untouched here
/// because it is runtime output.
/// NOTE(review): the commented-out copies of the same writes below look like the
/// post-change side of a diff in which this console output was disabled — confirm
/// which version is actually live before relying on this description.
/// </remarks>
public void PrintModelInfo()
{
Console.WriteLine("Aplha: " + alpha.ToString());
Console.WriteLine("Beta: " + beta.ToString());
Console.WriteLine("M: " + M);
Console.WriteLine("K: " + K);
Console.WriteLine("V: " + V);
Console.WriteLine("Total iterations:" + niters);
Console.WriteLine("Save at: " + savestep);
Console.WriteLine();
//Console.WriteLine("Aplha: " + alpha.ToString());
//Console.WriteLine("Beta: " + beta.ToString());
//Console.WriteLine("M: " + M);
//Console.WriteLine("K: " + K);
//Console.WriteLine("V: " + V);
//Console.WriteLine("Total iterations:" + niters);
//Console.WriteLine("Save at: " + savestep);
//Console.WriteLine();
}
private void GibbsSampling(int totalIter)
{
for (int iter = 1; iter <= totalIter; iter++)
{
Console.Write("Iteration " + iter + ":");
//Console.Write("Iteration " + iter + ":");
var stopWatch = new Stopwatch();
stopWatch.Start();
for (int i = 0; i < wn; i++)
......@@ -159,7 +160,7 @@ namespace LDA.LdaModel
}
stopWatch.Stop();
Console.WriteLine(stopWatch.ElapsedMilliseconds / 1000.0 + " seconds");
//Console.WriteLine(stopWatch.ElapsedMilliseconds / 1000.0 + " seconds");
if (iter % savestep == 0)
{
//保存参数、文档分布等
......@@ -195,7 +196,7 @@ namespace LDA.LdaModel
}
//主题编号
//sw.Write("Topic " + k + "th:\n");
Console.WriteLine("Topic " + k + "th:\n");
//Console.WriteLine("Topic " + k + "th:\n");
//主题词
var wordsProbsListOrdered = wordsProbsList.OrderBy(e => -e.Value).ToList();
......@@ -203,11 +204,11 @@ namespace LDA.LdaModel
{
string word = cor.GetStringByID(wordsProbsListOrdered[i].Key);
// sw.WriteLine("\t" + word + " " + wordsProbsListOrdered[i].Value);
Console.WriteLine("\t" + word + " " + wordsProbsListOrdered[i].Value);
//Console.WriteLine("\t" + word + " " + wordsProbsListOrdered[i].Value);
}
}
Console.WriteLine("LogLikelihood= " + LogLikelihood);
//Console.WriteLine("LogLikelihood= " + LogLikelihood);
}
}
}
......
......@@ -6,32 +6,32 @@ using System.Threading.Tasks;
namespace LDA.LdaModel
{
class TopicSimilarityCos
public static class TopicSimilarityCos
{
public double SimilarityCos()
public static double SimilarityCos(List<string> lstr1, List<string> lstr2)
{
List<string> lstr1 = new List<string>();
lstr1.Add("稀疏");
lstr1.Add("信号");
lstr1.Add("算法");
lstr1.Add("方法");
lstr1.Add("轴承");
lstr1.Add("故障");
lstr1.Add("贝叶斯");
lstr1.Add("模型");
lstr1.Add("发动机");
lstr1.Add("降低");
List<string> lstr2 = new List<string>();
lstr2.Add("测量");
lstr2.Add("发动机");
lstr2.Add("叶尖");
lstr2.Add("间隙");
lstr2.Add("性能");
lstr2.Add("控制");
lstr2.Add("微波");
lstr2.Add("叶片");
lstr2.Add("降低");
lstr2.Add("自主");
//List<string> lstr1 = new List<string>();
//lstr1.Add("稀疏");
//lstr1.Add("信号");
//lstr1.Add("算法");
//lstr1.Add("方法");
//lstr1.Add("轴承");
//lstr1.Add("故障");
//lstr1.Add("贝叶斯");
//lstr1.Add("模型");
//lstr1.Add("发动机");
//lstr1.Add("降低");
//List<string> lstr2 = new List<string>();
//lstr2.Add("测量");
//lstr2.Add("发动机");
//lstr2.Add("叶尖");
//lstr2.Add("间隙");
//lstr2.Add("性能");
//lstr2.Add("控制");
//lstr2.Add("微波");
//lstr2.Add("叶片");
//lstr2.Add("降低");
//lstr2.Add("自主");
//求并集
var strUnion = lstr1.Union(lstr2);
......@@ -58,8 +58,7 @@ namespace LDA.LdaModel
//求分母(2)
den2 += Math.Pow(int2[i], 2);
}
double cos = s / (Math.Sqrt(den1) * Math.Sqrt(den2));
Console.WriteLine(cos);
//Console.WriteLine(cos);
return s / (Math.Sqrt(den1) * Math.Sqrt(den2));
}
}
......
......@@ -28,7 +28,7 @@ namespace Njust.Pdf.Analysis.Entities
public string SourceTopic { get; set; }
[ApiMember(Description = "目标主题编号")]
public string TargeTopic { get; set; }
public string TargetTopic { get; set; }
[ApiMember(Description = "源主题词")]
public string SourceTopicWord { get; set; }
......@@ -37,6 +37,6 @@ namespace Njust.Pdf.Analysis.Entities
public string TargetTopicWord { get; set; }
[ApiMember(Description = "余弦相似度")]
public string CosSim { get; set; }
public double CosSim { get; set; }
}
}
......@@ -116,9 +116,11 @@ namespace Njust.Pdf.Analysis.Tranforms
foreach (var item in preInserts)
{
var stream = files[item.HashCode];
var exist = exists.FirstOrDefault(o => o.HashCode == item.HashCode);
if (exist != null)
{
KiviiContext.VirtualFiles.WriteFile(exist.ImportPath, stream);
rtns.Results.Add(exist);
continue;
}
......@@ -126,7 +128,6 @@ namespace Njust.Pdf.Analysis.Tranforms
conn.Insert(item);
item.RemoveAllOnlyProperties();
rtns.Results.Add(item);
var stream = files[item.HashCode];
stream.Position = 0;
KiviiContext.VirtualFiles.WriteFile(item.ImportPath, stream);
......
......@@ -10,6 +10,7 @@ using System.IO;
using System.Linq;
using System.Reflection;
using System.Text;
using System.Text.RegularExpressions;
using System.Threading.Tasks;
namespace Njust.Pdf.Analysis.Tranforms
......@@ -433,77 +434,31 @@ namespace Njust.Pdf.Analysis.Tranforms
foreach (var kv in group)
{
List<String> englishtext = new List<string>();
List<String> chinesetext = new List<string>();
if (kv.Key.Language.IsNullOrEmpty() || kv.Key.DomainName.IsNullOrEmpty() || kv.Key.PublishTime.IsNullOrEmpty()) continue;
var items = kv.ToList();
string[] lemmatizeText;
if (kv.Key.Language == "英文")
{
for (var i = 0; i < items.Count(); i++)
var englishtext = new List<string>();
foreach (var item in items)
{
englishtext.Add(items[i].Title + ". " + items[i].Abstract);
englishtext.Add(item.Title + ". " + item.Abstract);
}
var EnText = LemmatizeEn.lemmatizeEnglish(englishtext);
CommandLineOption ldaoption = new CommandLineOption();
ldaoption.topics = (int)Math.Ceiling(System.Math.Sqrt(EnText.Length));
double normolK = 200;
ldaoption.alpha = ldaoption.topics / normolK;
ldaoption.beta = 0.5;
ldaoption.savestep = 500;
ldaoption.niters = 500;
ldaoption.twords = 10;
try
{
LDAGibbsSampling model = new LDAGibbsSampling();
Corpora cor = new Corpora();
cor.LoadDataFile(EnText);
var lda = model.TrainNewModel(cor, ldaoption);
for (int k = 0; k < lda.K; k++)
{
var wordsProbsList = new Dictionary<int, double>();
for (int w = 0; w < lda.V; w++)
{
wordsProbsList.Add(w, lda.phi[k][w]);
lemmatizeText = LemmatizeEn.lemmatizeEnglish(englishtext);
}
var wordsProbsListOrdered = wordsProbsList.OrderBy(e => -e.Value).ToList();
string word = "";
List<double> TopicWordsProbability = new List<double>(); ;
for (int i = 0; i < 10; i++)
else if (kv.Key.Language == "中文")
{
word += cor.GetStringByID(wordsProbsListOrdered[i].Key) + "; ";
TopicWordsProbability.Add(wordsProbsListOrdered[i].Value);
}
var DisTimeTopic = new DisTimeTopic();
DisTimeTopic.Language = kv.Key.Language;
DisTimeTopic.DomainName = kv.Key.DomainName;
DisTimeTopic.Year = kv.Key.PublishTime;
DisTimeTopic.Topic = "Topic " + k.ToString();
DisTimeTopic.TopicWord = word;
DisTimeTopic.TopicWordProbability = TopicWordsProbability;
DisTimeTopic.Documentlist = lda.theta.ToList();
DisTimeTopic.Parameter = "Aplha: " + lda.alpha.ToString() + ";" + "Beta: " + lda.beta.ToString() + ";" + "文档数: " + lda.M.ToString() + ";" + "主题数: " + lda.K.ToString() + ";" + "词袋数: " + lda.V.ToString() + ";" + "迭代次数:" + ldaoption.niters.ToString() + ";" + "LogLikelihood= " + lda.LogLikelihood.ToString();
conn.Insert(DisTimeTopic);
DisTimeTopic.RemoveAllOnlyProperties();
rtns.Results.Add(DisTimeTopic);
}
}
catch (Exception ex)
var chinesetext = new List<string>();
foreach (var item in items)
{
Console.WriteLine(ex.StackTrace);
Console.WriteLine(ex.Message);
}
chinesetext.Add(item.Title + "。" + item.Abstract);
}
if (kv.Key.Language == "中文")
{
for (var i = 0; i < items.Count(); i++)
{
chinesetext.Add(items[i].Title + "。" + items[i].Abstract);
lemmatizeText = ChinesePreprocessing.ChineseWordSegmentation(chinesetext);
}
var CnText = ChinesePreprocessing.ChineseWordSegmentation(chinesetext);
else continue;
CommandLineOption ldaoption = new CommandLineOption();
ldaoption.topics = (int)Math.Ceiling(System.Math.Sqrt(CnText.Length));
ldaoption.topics = (int)Math.Ceiling(System.Math.Sqrt(lemmatizeText.Length));
double normolK = 200;
ldaoption.alpha = ldaoption.topics / normolK;
ldaoption.beta = 0.5;
......@@ -514,7 +469,7 @@ namespace Njust.Pdf.Analysis.Tranforms
{
LDAGibbsSampling model = new LDAGibbsSampling();
Corpora cor = new Corpora();
cor.LoadDataFile(CnText);
cor.LoadDataFile(lemmatizeText);
var lda = model.TrainNewModel(cor, ldaoption);
for (int k = 0; k < lda.K; k++)
{
......@@ -524,26 +479,35 @@ namespace Njust.Pdf.Analysis.Tranforms
{
wordsProbsList.Add(w, lda.phi[k][w]);
}
var wordsProbsListOrdered = wordsProbsList.OrderBy(e => -e.Value).ToList();
string word = "";
List<double> TopicWordsProbability = new List<double>(); ;
for (int i = 0; i < 10; i++)
{
word += cor.GetStringByID(wordsProbsListOrdered[i].Key) + "; ";
TopicWordsProbability.Add(wordsProbsListOrdered[i].Value);
}
var DisTimeTopic = new DisTimeTopic();
DisTimeTopic.Language = kv.Key.Language;
DisTimeTopic.DomainName = kv.Key.DomainName;
DisTimeTopic.Year = kv.Key.PublishTime;
DisTimeTopic.Topic = "Topic " + k.ToString();
DisTimeTopic.TopicWord = word;
DisTimeTopic.TopicWordProbability = TopicWordsProbability;
DisTimeTopic.Documentlist = lda.theta.ToList();
DisTimeTopic.Parameter = "Aplha: " + lda.alpha.ToString() + ";" + "Beta: " + lda.beta.ToString() + ";" + "文档数: " + lda.M.ToString() + ";" + "主题数: " + lda.K.ToString() + ";" + "词袋数: " + lda.V.ToString() + ";" + "迭代次数:" + ldaoption.niters.ToString() + ";" + "LogLikelihood= " + lda.LogLikelihood.ToString();
conn.Insert(DisTimeTopic);
DisTimeTopic.RemoveAllOnlyProperties();
rtns.Results.Add(DisTimeTopic);
var wordsProbsListOrdereds = wordsProbsList.OrderBy(e => -e.Value).ToList();
List<string> words = new List<string>();
List<double> TopicWordsProbability = new List<double>();
foreach (var wordsProbsListOrdered in wordsProbsListOrdereds)
{
if (words.Count >= 10) break;
var word = cor.GetStringByID(wordsProbsListOrdered.Key);
if (word.IsNullOrEmpty()) continue;
words.Add(cor.GetStringByID(wordsProbsListOrdered.Key));
TopicWordsProbability.Add(wordsProbsListOrdered.Value);
}
var disTimeTopic = new DisTimeTopic();
disTimeTopic.Language = kv.Key.Language;
disTimeTopic.DomainName = kv.Key.DomainName;
var year= kv.Key.PublishTime.Replace(" ", "");
//提取多个数字,该方式会分别提取字符串中的数字,如:"ABC#123@AS456测试789"就会分别提取123、456、789
var reg = new Regex("[0-9]+", RegexOptions.IgnoreCase | RegexOptions.Singleline, TimeSpan.FromSeconds(2));
var mc = reg.Matches(year);
if (mc.Count <= 0) continue;
disTimeTopic.Year = mc[0].Value;
disTimeTopic.Topic = "Topic " + k.ToString();
//DisTimeTopic.TopicWord = word;
disTimeTopic.TopicWord = string.Join(";", words);
disTimeTopic.TopicWordProbability = TopicWordsProbability;
disTimeTopic.Documentlist = lda.theta.ToList();
disTimeTopic.Parameter = "Aplha: " + lda.alpha.ToString() + ";" + "Beta: " + lda.beta.ToString() + ";" + "文档数: " + lda.M.ToString() + ";" + "主题数: " + lda.K.ToString() + ";" + "词袋数: " + lda.V.ToString() + ";" + "迭代次数:" + ldaoption.niters.ToString() + ";" + "LogLikelihood= " + lda.logLikelihood.ToString();
conn.Insert(disTimeTopic);
disTimeTopic.RemoveAllOnlyProperties();
rtns.Results.Add(disTimeTopic);
}
}
catch (Exception ex)
......@@ -552,7 +516,6 @@ namespace Njust.Pdf.Analysis.Tranforms
Console.WriteLine(ex.Message);
}
}
}
rtns.Total = rtns.Results.Count();
return rtns;
}
......@@ -566,6 +529,51 @@ namespace Njust.Pdf.Analysis.Tranforms
var rtns = new RestfulQueryResponse<DisTopicEvolution>();
rtns.Results = new List<DisTopicEvolution>();
var conn = KiviiContext.GetOpenedDbConnection<DisTimeTopic>();
var query = conn.From<DisTimeTopic>();
query.OrderBy(o => o.Year);
var allDisTimeTopics = conn.Select(query);
var group = allDisTimeTopics.GroupBy(o => new { o.Year, o.DomainName, o.Language }).ToList();
conn.InitEntityType<DisTopicEvolution>();
conn.Delete<DisTopicEvolution>(o => o.Kvid == o.Kvid);
foreach(var kv in group)
{
var index = group.IndexOf(kv);
if (index + 1 == group.Count()) continue;
foreach(var item in kv)
{
var nextGroup = group.FirstOrDefault(o => o.Key.Language == item.Language & o.Key.DomainName == item.DomainName & o.Key.Year.ToInt() > item.Year.ToInt());
if (nextGroup == null) continue;
var sourceTopicWords = item.TopicWord.Split(';').ToList();
var disTopicEvolution = new DisTopicEvolution();
disTopicEvolution.DomainName = item.DomainName;
disTopicEvolution.Language = item.Language;
disTopicEvolution.SourceYear = item.Year;
disTopicEvolution.SourceTopic = item.Topic;
disTopicEvolution.SourceTopicWord = item.TopicWord;
foreach (var next in nextGroup)
{
var targetTopicWords = next.TopicWord.Split(';').ToList();
var cosSim = TopicSimilarityCos.SimilarityCos(sourceTopicWords, targetTopicWords);
if (disTopicEvolution.CosSim <= cosSim)
{
disTopicEvolution.CosSim = cosSim;
disTopicEvolution.TargetTopic = next.Topic;
disTopicEvolution.TargetYear = next.Year;
disTopicEvolution.TargetTopicWord = next.TopicWord;
}
}
conn.Insert(disTopicEvolution);
disTopicEvolution.RemoveAllOnlyProperties();
rtns.Results.Add(disTopicEvolution);
}
}
rtns.Total = rtns.Results.Count();
return rtns;
}
}
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment