Skip to content
Projects
Groups
Snippets
Help
This project
Loading...
Sign in / Register
Toggle navigation
N
Njust.Pdf.Analysis.V4.5
Project
Overview
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
陶然
Njust.Pdf.Analysis.V4.5
Commits
9993a834
Commit
9993a834
authored
Jun 18, 2021
by
陶然
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
init
parent
91845da6
Hide whitespace changes
Inline
Side-by-side
Showing
6 changed files
with
157 additions
and
147 deletions
+157
-147
LDA.cs
LDA/LdaModel/LDA.cs
+1
-0
LDAGibbsSampling.cs
LDA/LdaModel/LDAGibbsSampling.cs
+14
-13
TopicSimilarityCos.cs
LDA/LdaModel/TopicSimilarityCos.cs
+25
-26
DisTopicEvolution.cs
Src/Entities/DisTopicEvolution.cs
+2
-2
RestfulAnalyse.cs
Src/Tranforms/RestfulAnalyse.cs
+2
-1
RestfulDisAnalyse.cs
Src/Tranforms/RestfulDisAnalyse.cs
+113
-105
No files found.
LDA/LdaModel/LDA.cs
View file @
9993a834
...
@@ -9,6 +9,7 @@ namespace LDA.LdaModel
...
@@ -9,6 +9,7 @@ namespace LDA.LdaModel
public
int
K
;
//#Topics
public
int
K
;
//#Topics
public
double
alpha
;
// Dirichlet Prior Parameter for Document->Topic
public
double
alpha
;
// Dirichlet Prior Parameter for Document->Topic
public
double
beta
;
// Dirichlet Prior Parameter for Topic->Word
public
double
beta
;
// Dirichlet Prior Parameter for Topic->Word
public
double
logLikelihood
;
public
double
LogLikelihood
public
double
LogLikelihood
{
{
get
get
...
...
LDA/LdaModel/LDAGibbsSampling.cs
View file @
9993a834
...
@@ -130,26 +130,27 @@ namespace LDA.LdaModel
...
@@ -130,26 +130,27 @@ namespace LDA.LdaModel
getLda
.
M
=
M
;
getLda
.
M
=
M
;
getLda
.
phi
=
phi
;
getLda
.
phi
=
phi
;
getLda
.
theta
=
theta
;
getLda
.
theta
=
theta
;
getLda
.
logLikelihood
=
LogLikelihood
;
return
getLda
;
return
getLda
;
}
}
public
void
PrintModelInfo
()
public
void
PrintModelInfo
()
{
{
Console
.
WriteLine
(
"Aplha: "
+
alpha
.
ToString
());
//
Console.WriteLine("Aplha: " + alpha.ToString());
Console
.
WriteLine
(
"Beta: "
+
beta
.
ToString
());
//
Console.WriteLine("Beta: " + beta.ToString());
Console
.
WriteLine
(
"M: "
+
M
);
//
Console.WriteLine("M: " + M);
Console
.
WriteLine
(
"K: "
+
K
);
//
Console.WriteLine("K: " + K);
Console
.
WriteLine
(
"V: "
+
V
);
//
Console.WriteLine("V: " + V);
Console
.
WriteLine
(
"Total iterations:"
+
niters
);
//
Console.WriteLine("Total iterations:" + niters);
Console
.
WriteLine
(
"Save at: "
+
savestep
);
//
Console.WriteLine("Save at: " + savestep);
Console
.
WriteLine
();
//
Console.WriteLine();
}
}
private
void
GibbsSampling
(
int
totalIter
)
private
void
GibbsSampling
(
int
totalIter
)
{
{
for
(
int
iter
=
1
;
iter
<=
totalIter
;
iter
++)
for
(
int
iter
=
1
;
iter
<=
totalIter
;
iter
++)
{
{
Console
.
Write
(
"Iteration "
+
iter
+
":"
);
//
Console.Write("Iteration " + iter + ":");
var
stopWatch
=
new
Stopwatch
();
var
stopWatch
=
new
Stopwatch
();
stopWatch
.
Start
();
stopWatch
.
Start
();
for
(
int
i
=
0
;
i
<
wn
;
i
++)
for
(
int
i
=
0
;
i
<
wn
;
i
++)
...
@@ -159,7 +160,7 @@ namespace LDA.LdaModel
...
@@ -159,7 +160,7 @@ namespace LDA.LdaModel
}
}
stopWatch
.
Stop
();
stopWatch
.
Stop
();
Console
.
WriteLine
(
stopWatch
.
ElapsedMilliseconds
/
1000.0
+
" seconds"
);
//
Console.WriteLine(stopWatch.ElapsedMilliseconds / 1000.0 + " seconds");
if
(
iter
%
savestep
==
0
)
if
(
iter
%
savestep
==
0
)
{
{
//保存参数、文档分布等
//保存参数、文档分布等
...
@@ -195,7 +196,7 @@ namespace LDA.LdaModel
...
@@ -195,7 +196,7 @@ namespace LDA.LdaModel
}
}
//主题编号
//主题编号
//sw.Write("Topic " + k + "th:\n");
//sw.Write("Topic " + k + "th:\n");
Console
.
WriteLine
(
"Topic "
+
k
+
"th:\n"
);
//
Console.WriteLine("Topic " + k + "th:\n");
//主题词
//主题词
var
wordsProbsListOrdered
=
wordsProbsList
.
OrderBy
(
e
=>
-
e
.
Value
).
ToList
();
var
wordsProbsListOrdered
=
wordsProbsList
.
OrderBy
(
e
=>
-
e
.
Value
).
ToList
();
...
@@ -203,11 +204,11 @@ namespace LDA.LdaModel
...
@@ -203,11 +204,11 @@ namespace LDA.LdaModel
{
{
string
word
=
cor
.
GetStringByID
(
wordsProbsListOrdered
[
i
].
Key
);
string
word
=
cor
.
GetStringByID
(
wordsProbsListOrdered
[
i
].
Key
);
// sw.WriteLine("\t" + word + " " + wordsProbsListOrdered[i].Value);
// sw.WriteLine("\t" + word + " " + wordsProbsListOrdered[i].Value);
Console
.
WriteLine
(
"\t"
+
word
+
" "
+
wordsProbsListOrdered
[
i
].
Value
);
//
Console.WriteLine("\t" + word + " " + wordsProbsListOrdered[i].Value);
}
}
}
}
Console
.
WriteLine
(
"LogLikelihood= "
+
LogLikelihood
);
//
Console.WriteLine("LogLikelihood= " + LogLikelihood);
}
}
}
}
}
}
...
...
LDA/LdaModel/TopicSimilarityCos.cs
View file @
9993a834
...
@@ -6,32 +6,32 @@ using System.Threading.Tasks;
...
@@ -6,32 +6,32 @@ using System.Threading.Tasks;
namespace
LDA.LdaModel
namespace
LDA.LdaModel
{
{
class
TopicSimilarityCos
public
static
class
TopicSimilarityCos
{
{
public
double
SimilarityCos
(
)
public
static
double
SimilarityCos
(
List
<
string
>
lstr1
,
List
<
string
>
lstr2
)
{
{
List
<
string
>
lstr1
=
new
List
<
string
>();
//
List<string> lstr1 = new List<string>();
lstr1
.
Add
(
"稀疏"
);
//
lstr1.Add("稀疏");
lstr1
.
Add
(
"信号"
);
//
lstr1.Add("信号");
lstr1
.
Add
(
"算法"
);
//
lstr1.Add("算法");
lstr1
.
Add
(
"方法"
);
//
lstr1.Add("方法");
lstr1
.
Add
(
"轴承"
);
//
lstr1.Add("轴承");
lstr1
.
Add
(
"故障"
);
//
lstr1.Add("故障");
lstr1
.
Add
(
"贝叶斯"
);
//
lstr1.Add("贝叶斯");
lstr1
.
Add
(
"模型"
);
//
lstr1.Add("模型");
lstr1
.
Add
(
"发动机"
);
//
lstr1.Add("发动机");
lstr1
.
Add
(
"降低"
);
//
lstr1.Add("降低");
List
<
string
>
lstr2
=
new
List
<
string
>();
//
List<string> lstr2 = new List<string>();
lstr2
.
Add
(
"测量"
);
//
lstr2.Add("测量");
lstr2
.
Add
(
"发动机"
);
//
lstr2.Add("发动机");
lstr2
.
Add
(
"叶尖"
);
//
lstr2.Add("叶尖");
lstr2
.
Add
(
"间隙"
);
//
lstr2.Add("间隙");
lstr2
.
Add
(
"性能"
);
//
lstr2.Add("性能");
lstr2
.
Add
(
"控制"
);
//
lstr2.Add("控制");
lstr2
.
Add
(
"微波"
);
//
lstr2.Add("微波");
lstr2
.
Add
(
"叶片"
);
//
lstr2.Add("叶片");
lstr2
.
Add
(
"降低"
);
//
lstr2.Add("降低");
lstr2
.
Add
(
"自主"
);
//
lstr2.Add("自主");
//求并集
//求并集
var
strUnion
=
lstr1
.
Union
(
lstr2
);
var
strUnion
=
lstr1
.
Union
(
lstr2
);
...
@@ -58,8 +58,7 @@ namespace LDA.LdaModel
...
@@ -58,8 +58,7 @@ namespace LDA.LdaModel
//求分母(2)
//求分母(2)
den2
+=
Math
.
Pow
(
int2
[
i
],
2
);
den2
+=
Math
.
Pow
(
int2
[
i
],
2
);
}
}
double
cos
=
s
/
(
Math
.
Sqrt
(
den1
)
*
Math
.
Sqrt
(
den2
));
//Console.WriteLine(cos);
Console
.
WriteLine
(
cos
);
return
s
/
(
Math
.
Sqrt
(
den1
)
*
Math
.
Sqrt
(
den2
));
return
s
/
(
Math
.
Sqrt
(
den1
)
*
Math
.
Sqrt
(
den2
));
}
}
}
}
...
...
Src/Entities/DisTopicEvolution.cs
View file @
9993a834
...
@@ -28,7 +28,7 @@ namespace Njust.Pdf.Analysis.Entities
...
@@ -28,7 +28,7 @@ namespace Njust.Pdf.Analysis.Entities
public
string
SourceTopic
{
get
;
set
;
}
public
string
SourceTopic
{
get
;
set
;
}
[
ApiMember
(
Description
=
"目标主题编号"
)]
[
ApiMember
(
Description
=
"目标主题编号"
)]
public
string
TargeTopic
{
get
;
set
;
}
public
string
Targe
t
Topic
{
get
;
set
;
}
[
ApiMember
(
Description
=
"源主题词"
)]
[
ApiMember
(
Description
=
"源主题词"
)]
public
string
SourceTopicWord
{
get
;
set
;
}
public
string
SourceTopicWord
{
get
;
set
;
}
...
@@ -37,6 +37,6 @@ namespace Njust.Pdf.Analysis.Entities
...
@@ -37,6 +37,6 @@ namespace Njust.Pdf.Analysis.Entities
public
string
TargetTopicWord
{
get
;
set
;
}
public
string
TargetTopicWord
{
get
;
set
;
}
[
ApiMember
(
Description
=
"余弦相似度"
)]
[
ApiMember
(
Description
=
"余弦相似度"
)]
public
string
CosSim
{
get
;
set
;
}
public
double
CosSim
{
get
;
set
;
}
}
}
}
}
Src/Tranforms/RestfulAnalyse.cs
View file @
9993a834
...
@@ -116,9 +116,11 @@ namespace Njust.Pdf.Analysis.Tranforms
...
@@ -116,9 +116,11 @@ namespace Njust.Pdf.Analysis.Tranforms
foreach
(
var
item
in
preInserts
)
foreach
(
var
item
in
preInserts
)
{
{
var
stream
=
files
[
item
.
HashCode
];
var
exist
=
exists
.
FirstOrDefault
(
o
=>
o
.
HashCode
==
item
.
HashCode
);
var
exist
=
exists
.
FirstOrDefault
(
o
=>
o
.
HashCode
==
item
.
HashCode
);
if
(
exist
!=
null
)
if
(
exist
!=
null
)
{
{
KiviiContext
.
VirtualFiles
.
WriteFile
(
exist
.
ImportPath
,
stream
);
rtns
.
Results
.
Add
(
exist
);
rtns
.
Results
.
Add
(
exist
);
continue
;
continue
;
}
}
...
@@ -126,7 +128,6 @@ namespace Njust.Pdf.Analysis.Tranforms
...
@@ -126,7 +128,6 @@ namespace Njust.Pdf.Analysis.Tranforms
conn
.
Insert
(
item
);
conn
.
Insert
(
item
);
item
.
RemoveAllOnlyProperties
();
item
.
RemoveAllOnlyProperties
();
rtns
.
Results
.
Add
(
item
);
rtns
.
Results
.
Add
(
item
);
var
stream
=
files
[
item
.
HashCode
];
stream
.
Position
=
0
;
stream
.
Position
=
0
;
KiviiContext
.
VirtualFiles
.
WriteFile
(
item
.
ImportPath
,
stream
);
KiviiContext
.
VirtualFiles
.
WriteFile
(
item
.
ImportPath
,
stream
);
...
...
Src/Tranforms/RestfulDisAnalyse.cs
View file @
9993a834
...
@@ -10,6 +10,7 @@ using System.IO;
...
@@ -10,6 +10,7 @@ using System.IO;
using
System.Linq
;
using
System.Linq
;
using
System.Reflection
;
using
System.Reflection
;
using
System.Text
;
using
System.Text
;
using
System.Text.RegularExpressions
;
using
System.Threading.Tasks
;
using
System.Threading.Tasks
;
namespace
Njust.Pdf.Analysis.Tranforms
namespace
Njust.Pdf.Analysis.Tranforms
...
@@ -433,125 +434,87 @@ namespace Njust.Pdf.Analysis.Tranforms
...
@@ -433,125 +434,87 @@ namespace Njust.Pdf.Analysis.Tranforms
foreach
(
var
kv
in
group
)
foreach
(
var
kv
in
group
)
{
{
List
<
String
>
englishtext
=
new
List
<
string
>();
if
(
kv
.
Key
.
Language
.
IsNullOrEmpty
()
||
kv
.
Key
.
DomainName
.
IsNullOrEmpty
()
||
kv
.
Key
.
PublishTime
.
IsNullOrEmpty
())
continue
;
List
<
String
>
chinesetext
=
new
List
<
string
>();
var
items
=
kv
.
ToList
();
var
items
=
kv
.
ToList
();
string
[]
lemmatizeText
;
if
(
kv
.
Key
.
Language
==
"英文"
)
if
(
kv
.
Key
.
Language
==
"英文"
)
{
{
for
(
var
i
=
0
;
i
<
items
.
Count
();
i
++)
var
englishtext
=
new
List
<
string
>();
{
foreach
(
var
item
in
items
)
englishtext
.
Add
(
items
[
i
].
Title
+
". "
+
items
[
i
].
Abstract
);
}
var
EnText
=
LemmatizeEn
.
lemmatizeEnglish
(
englishtext
);
CommandLineOption
ldaoption
=
new
CommandLineOption
();
ldaoption
.
topics
=
(
int
)
Math
.
Ceiling
(
System
.
Math
.
Sqrt
(
EnText
.
Length
));
double
normolK
=
200
;
ldaoption
.
alpha
=
ldaoption
.
topics
/
normolK
;
ldaoption
.
beta
=
0.5
;
ldaoption
.
savestep
=
500
;
ldaoption
.
niters
=
500
;
ldaoption
.
twords
=
10
;
try
{
LDAGibbsSampling
model
=
new
LDAGibbsSampling
();
Corpora
cor
=
new
Corpora
();
cor
.
LoadDataFile
(
EnText
);
var
lda
=
model
.
TrainNewModel
(
cor
,
ldaoption
);
for
(
int
k
=
0
;
k
<
lda
.
K
;
k
++)
{
var
wordsProbsList
=
new
Dictionary
<
int
,
double
>();
for
(
int
w
=
0
;
w
<
lda
.
V
;
w
++)
{
wordsProbsList
.
Add
(
w
,
lda
.
phi
[
k
][
w
]);
}
var
wordsProbsListOrdered
=
wordsProbsList
.
OrderBy
(
e
=>
-
e
.
Value
).
ToList
();
string
word
=
""
;
List
<
double
>
TopicWordsProbability
=
new
List
<
double
>();
;
for
(
int
i
=
0
;
i
<
10
;
i
++)
{
word
+=
cor
.
GetStringByID
(
wordsProbsListOrdered
[
i
].
Key
)
+
"; "
;
TopicWordsProbability
.
Add
(
wordsProbsListOrdered
[
i
].
Value
);
}
var
DisTimeTopic
=
new
DisTimeTopic
();
DisTimeTopic
.
Language
=
kv
.
Key
.
Language
;
DisTimeTopic
.
DomainName
=
kv
.
Key
.
DomainName
;
DisTimeTopic
.
Year
=
kv
.
Key
.
PublishTime
;
DisTimeTopic
.
Topic
=
"Topic "
+
k
.
ToString
();
DisTimeTopic
.
TopicWord
=
word
;
DisTimeTopic
.
TopicWordProbability
=
TopicWordsProbability
;
DisTimeTopic
.
Documentlist
=
lda
.
theta
.
ToList
();
DisTimeTopic
.
Parameter
=
"Aplha: "
+
lda
.
alpha
.
ToString
()
+
";"
+
"Beta: "
+
lda
.
beta
.
ToString
()
+
";"
+
"文档数: "
+
lda
.
M
.
ToString
()
+
";"
+
"主题数: "
+
lda
.
K
.
ToString
()
+
";"
+
"词袋数: "
+
lda
.
V
.
ToString
()
+
";"
+
"迭代次数:"
+
ldaoption
.
niters
.
ToString
()
+
";"
+
"LogLikelihood= "
+
lda
.
LogLikelihood
.
ToString
();
conn
.
Insert
(
DisTimeTopic
);
DisTimeTopic
.
RemoveAllOnlyProperties
();
rtns
.
Results
.
Add
(
DisTimeTopic
);
}
}
catch
(
Exception
ex
)
{
{
Console
.
WriteLine
(
ex
.
StackTrace
);
englishtext
.
Add
(
item
.
Title
+
". "
+
item
.
Abstract
);
Console
.
WriteLine
(
ex
.
Message
);
}
}
lemmatizeText
=
LemmatizeEn
.
lemmatizeEnglish
(
englishtext
);
}
}
if
(
kv
.
Key
.
Language
==
"中文"
)
else
if
(
kv
.
Key
.
Language
==
"中文"
)
{
{
for
(
var
i
=
0
;
i
<
items
.
Count
();
i
++)
var
chinesetext
=
new
List
<
string
>();
foreach
(
var
item
in
items
)
{
{
chinesetext
.
Add
(
item
s
[
i
].
Title
+
"。"
+
items
[
i
]
.
Abstract
);
chinesetext
.
Add
(
item
.
Title
+
"。"
+
item
.
Abstract
);
}
}
var
CnText
=
ChinesePreprocessing
.
ChineseWordSegmentation
(
chinesetext
);
lemmatizeText
=
ChinesePreprocessing
.
ChineseWordSegmentation
(
chinesetext
);
CommandLineOption
ldaoption
=
new
CommandLineOption
();
}
ldaoption
.
topics
=
(
int
)
Math
.
Ceiling
(
System
.
Math
.
Sqrt
(
CnText
.
Length
));
else
continue
;
double
normolK
=
200
;
ldaoption
.
alpha
=
ldaoption
.
topics
/
normolK
;
CommandLineOption
ldaoption
=
new
CommandLineOption
();
ldaoption
.
beta
=
0.5
;
ldaoption
.
topics
=
(
int
)
Math
.
Ceiling
(
System
.
Math
.
Sqrt
(
lemmatizeText
.
Length
));
ldaoption
.
savestep
=
500
;
double
normolK
=
200
;
ldaoption
.
niters
=
500
;
ldaoption
.
alpha
=
ldaoption
.
topics
/
normolK
;
ldaoption
.
twords
=
10
;
ldaoption
.
beta
=
0.5
;
try
ldaoption
.
savestep
=
500
;
ldaoption
.
niters
=
500
;
ldaoption
.
twords
=
10
;
try
{
LDAGibbsSampling
model
=
new
LDAGibbsSampling
();
Corpora
cor
=
new
Corpora
();
cor
.
LoadDataFile
(
lemmatizeText
);
var
lda
=
model
.
TrainNewModel
(
cor
,
ldaoption
);
for
(
int
k
=
0
;
k
<
lda
.
K
;
k
++)
{
{
LDAGibbsSampling
model
=
new
LDAGibbsSampling
();
var
wordsProbsList
=
new
Dictionary
<
int
,
double
>();
Corpora
cor
=
new
Corpora
();
cor
.
LoadDataFile
(
CnText
);
for
(
int
w
=
0
;
w
<
lda
.
V
;
w
++)
var
lda
=
model
.
TrainNewModel
(
cor
,
ldaoption
);
for
(
int
k
=
0
;
k
<
lda
.
K
;
k
++)
{
{
var
wordsProbsList
=
new
Dictionary
<
int
,
double
>();
wordsProbsList
.
Add
(
w
,
lda
.
phi
[
k
][
w
]);
for
(
int
w
=
0
;
w
<
lda
.
V
;
w
++)
{
wordsProbsList
.
Add
(
w
,
lda
.
phi
[
k
][
w
]);
}
var
wordsProbsListOrdered
=
wordsProbsList
.
OrderBy
(
e
=>
-
e
.
Value
).
ToList
();
string
word
=
""
;
List
<
double
>
TopicWordsProbability
=
new
List
<
double
>();
;
for
(
int
i
=
0
;
i
<
10
;
i
++)
{
word
+=
cor
.
GetStringByID
(
wordsProbsListOrdered
[
i
].
Key
)
+
"; "
;
TopicWordsProbability
.
Add
(
wordsProbsListOrdered
[
i
].
Value
);
}
var
DisTimeTopic
=
new
DisTimeTopic
();
DisTimeTopic
.
Language
=
kv
.
Key
.
Language
;
DisTimeTopic
.
DomainName
=
kv
.
Key
.
DomainName
;
DisTimeTopic
.
Year
=
kv
.
Key
.
PublishTime
;
DisTimeTopic
.
Topic
=
"Topic "
+
k
.
ToString
();
DisTimeTopic
.
TopicWord
=
word
;
DisTimeTopic
.
TopicWordProbability
=
TopicWordsProbability
;
DisTimeTopic
.
Documentlist
=
lda
.
theta
.
ToList
();
DisTimeTopic
.
Parameter
=
"Aplha: "
+
lda
.
alpha
.
ToString
()
+
";"
+
"Beta: "
+
lda
.
beta
.
ToString
()
+
";"
+
"文档数: "
+
lda
.
M
.
ToString
()
+
";"
+
"主题数: "
+
lda
.
K
.
ToString
()
+
";"
+
"词袋数: "
+
lda
.
V
.
ToString
()
+
";"
+
"迭代次数:"
+
ldaoption
.
niters
.
ToString
()
+
";"
+
"LogLikelihood= "
+
lda
.
LogLikelihood
.
ToString
();
conn
.
Insert
(
DisTimeTopic
);
DisTimeTopic
.
RemoveAllOnlyProperties
();
rtns
.
Results
.
Add
(
DisTimeTopic
);
}
}
}
var
wordsProbsListOrdereds
=
wordsProbsList
.
OrderBy
(
e
=>
-
e
.
Value
).
ToList
();
catch
(
Exception
ex
)
List
<
string
>
words
=
new
List
<
string
>();
{
List
<
double
>
TopicWordsProbability
=
new
List
<
double
>();
Console
.
WriteLine
(
ex
.
StackTrace
);
foreach
(
var
wordsProbsListOrdered
in
wordsProbsListOrdereds
)
Console
.
WriteLine
(
ex
.
Message
);
{
if
(
words
.
Count
>=
10
)
break
;
var
word
=
cor
.
GetStringByID
(
wordsProbsListOrdered
.
Key
);
if
(
word
.
IsNullOrEmpty
())
continue
;
words
.
Add
(
cor
.
GetStringByID
(
wordsProbsListOrdered
.
Key
));
TopicWordsProbability
.
Add
(
wordsProbsListOrdered
.
Value
);
}
var
disTimeTopic
=
new
DisTimeTopic
();
disTimeTopic
.
Language
=
kv
.
Key
.
Language
;
disTimeTopic
.
DomainName
=
kv
.
Key
.
DomainName
;
var
year
=
kv
.
Key
.
PublishTime
.
Replace
(
" "
,
""
);
//提取多个数字,该方式会分别提取字符串中的数字,如:"ABC#123@AS456测试789"就会分别提取123、456、789
var
reg
=
new
Regex
(
"[0-9]+"
,
RegexOptions
.
IgnoreCase
|
RegexOptions
.
Singleline
,
TimeSpan
.
FromSeconds
(
2
));
var
mc
=
reg
.
Matches
(
year
);
if
(
mc
.
Count
<=
0
)
continue
;
disTimeTopic
.
Year
=
mc
[
0
].
Value
;
disTimeTopic
.
Topic
=
"Topic "
+
k
.
ToString
();
//DisTimeTopic.TopicWord = word;
disTimeTopic
.
TopicWord
=
string
.
Join
(
";"
,
words
);
disTimeTopic
.
TopicWordProbability
=
TopicWordsProbability
;
disTimeTopic
.
Documentlist
=
lda
.
theta
.
ToList
();
disTimeTopic
.
Parameter
=
"Aplha: "
+
lda
.
alpha
.
ToString
()
+
";"
+
"Beta: "
+
lda
.
beta
.
ToString
()
+
";"
+
"文档数: "
+
lda
.
M
.
ToString
()
+
";"
+
"主题数: "
+
lda
.
K
.
ToString
()
+
";"
+
"词袋数: "
+
lda
.
V
.
ToString
()
+
";"
+
"迭代次数:"
+
ldaoption
.
niters
.
ToString
()
+
";"
+
"LogLikelihood= "
+
lda
.
logLikelihood
.
ToString
();
conn
.
Insert
(
disTimeTopic
);
disTimeTopic
.
RemoveAllOnlyProperties
();
rtns
.
Results
.
Add
(
disTimeTopic
);
}
}
}
}
catch
(
Exception
ex
)
{
Console
.
WriteLine
(
ex
.
StackTrace
);
Console
.
WriteLine
(
ex
.
Message
);
}
}
}
rtns
.
Total
=
rtns
.
Results
.
Count
();
rtns
.
Total
=
rtns
.
Results
.
Count
();
return
rtns
;
return
rtns
;
...
@@ -566,6 +529,51 @@ namespace Njust.Pdf.Analysis.Tranforms
...
@@ -566,6 +529,51 @@ namespace Njust.Pdf.Analysis.Tranforms
var
rtns
=
new
RestfulQueryResponse
<
DisTopicEvolution
>();
var
rtns
=
new
RestfulQueryResponse
<
DisTopicEvolution
>();
rtns
.
Results
=
new
List
<
DisTopicEvolution
>();
rtns
.
Results
=
new
List
<
DisTopicEvolution
>();
var
conn
=
KiviiContext
.
GetOpenedDbConnection
<
DisTimeTopic
>();
var
query
=
conn
.
From
<
DisTimeTopic
>();
query
.
OrderBy
(
o
=>
o
.
Year
);
var
allDisTimeTopics
=
conn
.
Select
(
query
);
var
group
=
allDisTimeTopics
.
GroupBy
(
o
=>
new
{
o
.
Year
,
o
.
DomainName
,
o
.
Language
}).
ToList
();
conn
.
InitEntityType
<
DisTopicEvolution
>();
conn
.
Delete
<
DisTopicEvolution
>(
o
=>
o
.
Kvid
==
o
.
Kvid
);
foreach
(
var
kv
in
group
)
{
var
index
=
group
.
IndexOf
(
kv
);
if
(
index
+
1
==
group
.
Count
())
continue
;
foreach
(
var
item
in
kv
)
{
var
nextGroup
=
group
.
FirstOrDefault
(
o
=>
o
.
Key
.
Language
==
item
.
Language
&
o
.
Key
.
DomainName
==
item
.
DomainName
&
o
.
Key
.
Year
.
ToInt
()
>
item
.
Year
.
ToInt
());
if
(
nextGroup
==
null
)
continue
;
var
sourceTopicWords
=
item
.
TopicWord
.
Split
(
';'
).
ToList
();
var
disTopicEvolution
=
new
DisTopicEvolution
();
disTopicEvolution
.
DomainName
=
item
.
DomainName
;
disTopicEvolution
.
Language
=
item
.
Language
;
disTopicEvolution
.
SourceYear
=
item
.
Year
;
disTopicEvolution
.
SourceTopic
=
item
.
Topic
;
disTopicEvolution
.
SourceTopicWord
=
item
.
TopicWord
;
foreach
(
var
next
in
nextGroup
)
{
var
targetTopicWords
=
next
.
TopicWord
.
Split
(
';'
).
ToList
();
var
cosSim
=
TopicSimilarityCos
.
SimilarityCos
(
sourceTopicWords
,
targetTopicWords
);
if
(
disTopicEvolution
.
CosSim
<=
cosSim
)
{
disTopicEvolution
.
CosSim
=
cosSim
;
disTopicEvolution
.
TargetTopic
=
next
.
Topic
;
disTopicEvolution
.
TargetYear
=
next
.
Year
;
disTopicEvolution
.
TargetTopicWord
=
next
.
TopicWord
;
}
}
conn
.
Insert
(
disTopicEvolution
);
disTopicEvolution
.
RemoveAllOnlyProperties
();
rtns
.
Results
.
Add
(
disTopicEvolution
);
}
}
rtns
.
Total
=
rtns
.
Results
.
Count
();
return
rtns
;
return
rtns
;
}
}
}
}
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment