Goran Glavaš
committed on
Commit · cf27868
Parent(s): f6fa5b0
Code, binary, data, and README
Browse files
- README.txt +60 -0
- binary/graphseg.jar +3 -0
- data/manifestos-gold-segmented/61320_200411.txt +0 -0
- data/manifestos-gold-segmented/61320_200811.txt +0 -0
- data/manifestos-gold-segmented/61320_201211.txt +0 -0
- data/manifestos-gold-segmented/61620_200411.txt +0 -0
- data/manifestos-gold-segmented/61620_200811.txt +0 -0
- data/manifestos-gold-segmented/61620_201211.txt +0 -0
- data/manifestos-original-clean/61320_200411.txt +0 -0
- data/manifestos-original-clean/61320_200811.txt +0 -0
- data/manifestos-original-clean/61320_201211.txt +0 -0
- data/manifestos-original-clean/61620_200411.txt +0 -0
- data/manifestos-original-clean/61620_200811.txt +0 -0
- data/manifestos-original-clean/61620_201211.txt +0 -0
- source/pom.xml +85 -0
- source/src/config.properties +3 -0
- source/src/edu/uma/nlp/graphseg/ClusteringHandler.java +206 -0
- source/src/edu/uma/nlp/graphseg/GraphHandler.java +134 -0
- source/src/edu/uma/nlp/graphseg/IOHandler.java +33 -0
- source/src/edu/uma/nlp/graphseg/STSHandler.java +37 -0
- source/src/edu/uma/nlp/graphseg/Start.java +122 -0
- source/src/edu/uma/nlp/graphseg/preprocessing/Annotation.java +36 -0
- source/src/edu/uma/nlp/graphseg/preprocessing/AnnotationType.java +14 -0
- source/src/edu/uma/nlp/graphseg/preprocessing/AnnotatorChain.java +35 -0
- source/src/edu/uma/nlp/graphseg/preprocessing/AnnotatorType.java +11 -0
- source/src/edu/uma/nlp/graphseg/preprocessing/Document.java +110 -0
- source/src/edu/uma/nlp/graphseg/preprocessing/IAnnotator.java +9 -0
- source/src/edu/uma/nlp/graphseg/preprocessing/NamedEntityAnnotation.java +88 -0
- source/src/edu/uma/nlp/graphseg/preprocessing/NamedEntityTokenAnnotation.java +38 -0
- source/src/edu/uma/nlp/graphseg/preprocessing/NamedEntityType.java +18 -0
- source/src/edu/uma/nlp/graphseg/preprocessing/PartOfSpeechAnnotation.java +69 -0
- source/src/edu/uma/nlp/graphseg/preprocessing/SentenceAnnotation.java +66 -0
- source/src/edu/uma/nlp/graphseg/preprocessing/StanfordAnnotator.java +142 -0
- source/src/edu/uma/nlp/graphseg/preprocessing/TokenAnnotation.java +104 -0
- source/src/edu/uma/nlp/graphseg/semantics/InformationContent.java +77 -0
- source/src/edu/uma/nlp/graphseg/semantics/SemanticSimilarity.java +252 -0
- source/src/edu/uma/nlp/graphseg/semantics/WordVectorSpace.java +151 -0
- source/src/edu/uma/nlp/graphseg/utils/ApplicationConfiguration.java +49 -0
- source/src/edu/uma/nlp/graphseg/utils/IOHelper.java +385 -0
- source/src/edu/uma/nlp/graphseg/utils/MemoryStorage.java +26 -0
- source/src/edu/uma/nlp/graphseg/utils/VectorOperations.java +45 -0
README.txt
ADDED
@@ -0,0 +1,60 @@
About
========

GraphSeg is a tool for semantic/topical segmentation of text that employs semantic relatedness and a graph-based algorithm to identify semantically coherent segments in text.
Segmentation is performed at the sentence level (there are no intra-sentential segment beginnings/ends).

Content
========

This repository contains:

(1) the Java source code (as a Maven project);
(2) the ready-to-use binary version of the tool (graphseg.jar in the /binary folder);
(3) the dataset of political manifestos manually annotated with segments (used for evaluation in the research paper that the GraphSeg tool accompanies).

Usage
========

The following command with four arguments runs the GraphSeg tool:

java -jar graphseg.jar <input-folder-path> <output-folder-path> <relatedness-threshold> <minimal-segment-size>

The arguments (all mandatory) to be provided are:

(1) <input-folder-path> is the path to the folder (directory) containing the raw text documents that need to be topically/semantically segmented;
(2) <output-folder-path> is the path to the folder in which the semantically/topically segmented input documents are to be stored;
(3) <relatedness-threshold> is the value of the relatedness threshold (a decimal number) used in the construction of the relatedness graph: larger values yield a larger number of smaller segments, whereas smaller values yield a smaller number of coarser segments;
(4) <minimal-segment-size> defines the minimal segment size m (in number of sentences); GraphSeg will not produce segments containing fewer than m sentences.

Example command:

java -jar graphseg.jar /home/seg-input /home/seg-output 0.25 3

Credit
========

In case you use GraphSeg in your research, please give appropriate credit to our work by citing the following publication:

@InProceedings{glavavs-nanni-ponzetto:2016:*SEM,
  author    = {Glava\v{s}, Goran and Nanni, Federico and Ponzetto, Simone Paolo},
  title     = {Unsupervised Text Segmentation Using Semantic Relatedness Graphs},
  booktitle = {Proceedings of the Fifth Joint Conference on Lexical and Computational Semantics},
  month     = {August},
  year      = {2016},
  address   = {Berlin, Germany},
  publisher = {Association for Computational Linguistics},
  pages     = {125--130},
  url       = {http://anthology.aclweb.org/S16-2016}
}

Contact
========

Please address all questions about the GraphSeg tool and the *SEM publication to:

Dr. Goran Glavaš
Data and Web Science Group
University of Mannheim

Email: [email protected]
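For programmatic use, the pipeline behind the command line can also be driven directly from Java. The sketch below is illustrative only: it mirrors the Start class included in this commit, the class name SegmentOneDocument and the variable rawText are placeholders of mine, and it assumes the stopwords, word vectors, and information content have already been loaded via MemoryStorage and ApplicationConfiguration exactly as Start.main does.

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

import org.jgrapht.UndirectedGraph;
import org.jgrapht.graph.DefaultEdge;

import edu.uma.nlp.graphseg.ClusteringHandler;
import edu.uma.nlp.graphseg.GraphHandler;
import edu.uma.nlp.graphseg.preprocessing.Document;
import edu.uma.nlp.graphseg.preprocessing.StanfordAnnotator;

public class SegmentOneDocument {
    public static void main(String[] args) {
        // Assumes stopwords, word vectors, and information content were loaded
        // beforehand, exactly as Start.main does via ApplicationConfiguration.
        String rawText = "...";  // placeholder: the raw document to segment

        // sentence-split the document
        StanfordAnnotator annotator = new StanfordAnnotator();
        annotator.setStanfordAnnotators(new ArrayList<String>(Arrays.asList("tokenize", "ssplit")));
        Document doc = new Document();
        doc.setText(rawText);
        annotator.annotate(doc);

        // re-annotate every sentence as its own snippet (with POS tags and lemmas)
        annotator.setStanfordAnnotators(new ArrayList<String>(Arrays.asList("tokenize", "ssplit", "pos", "lemma")));
        List<Document> snippets = new ArrayList<Document>();
        for (int i = 0; i < doc.getSentences().size(); i++) {
            Document snippet = new Document(doc.getSentences().get(i).getText());
            annotator.annotate(snippet);
            snippet.setId(String.valueOf(i));
            snippets.add(snippet);
        }

        // relatedness graph -> maximal cliques -> linear segments
        UndirectedGraph<Integer, DefaultEdge> graph = GraphHandler.constructGraph(snippets, 0.25);
        List<List<Integer>> cliques = GraphHandler.getAllCliques(graph);
        List<List<Integer>> segments = new ClusteringHandler()
                .getSequentialClusters(cliques, GraphHandler.getAllSimilarities(), 3);
    }
}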
binary/graphseg.jar
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:83ac4ce85663bd97072a2fad76349bf923d1869b7acd7a67f797e6c16a1a47b2
size 350762888
data/manifestos-gold-segmented/61320_200411.txt
ADDED
The diff for this file is too large to render. See raw diff

data/manifestos-gold-segmented/61320_200811.txt
ADDED
The diff for this file is too large to render. See raw diff

data/manifestos-gold-segmented/61320_201211.txt
ADDED
The diff for this file is too large to render. See raw diff

data/manifestos-gold-segmented/61620_200411.txt
ADDED
The diff for this file is too large to render. See raw diff

data/manifestos-gold-segmented/61620_200811.txt
ADDED
The diff for this file is too large to render. See raw diff

data/manifestos-gold-segmented/61620_201211.txt
ADDED
The diff for this file is too large to render. See raw diff

data/manifestos-original-clean/61320_200411.txt
ADDED
The diff for this file is too large to render. See raw diff

data/manifestos-original-clean/61320_200811.txt
ADDED
The diff for this file is too large to render. See raw diff

data/manifestos-original-clean/61320_201211.txt
ADDED
The diff for this file is too large to render. See raw diff

data/manifestos-original-clean/61620_200411.txt
ADDED
The diff for this file is too large to render. See raw diff

data/manifestos-original-clean/61620_200811.txt
ADDED
The diff for this file is too large to render. See raw diff

data/manifestos-original-clean/61620_201211.txt
ADDED
The diff for this file is too large to render. See raw diff
source/pom.xml
ADDED
@@ -0,0 +1,85 @@
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
  <modelVersion>4.0.0</modelVersion>
  <groupId>edu.uma.nlp.graphseg</groupId>
  <artifactId>graphseg</artifactId>
  <version>0.0.1-SNAPSHOT</version>
  <name>graphseg</name>
  <description>Textual segmentation using graph-based algorithm using semantic relatedness</description>
  <build>
    <sourceDirectory>src</sourceDirectory>
    <resources>
      <resource>
        <directory>src</directory>
        <excludes>
          <exclude>**/*.java</exclude>
        </excludes>
      </resource>
    </resources>
    <plugins>
      <plugin>
        <artifactId>maven-compiler-plugin</artifactId>
        <version>3.3</version>
        <configuration>
          <source>1.8</source>
          <target>1.8</target>
        </configuration>
      </plugin>
      <plugin>
        <artifactId>maven-assembly-plugin</artifactId>
        <configuration>
          <archive>
            <manifest>
              <mainClass>edu.uma.nlp.graphseg.Start</mainClass>
            </manifest>
          </archive>
          <descriptorRefs>
            <descriptorRef>jar-with-dependencies</descriptorRef>
          </descriptorRefs>
        </configuration>
        <executions>
          <execution>
            <id>make-assembly</id>
            <!-- bind to the packaging phase -->
            <phase>package</phase>
            <goals>
              <goal>single</goal>
            </goals>
          </execution>
        </executions>
      </plugin>
    </plugins>
  </build>
  <dependencies>
    <dependency>
      <groupId>org.jgrapht</groupId>
      <artifactId>jgrapht-core</artifactId>
      <version>0.9.1</version>
    </dependency>
    <dependency>
      <groupId>org.javatuples</groupId>
      <artifactId>javatuples</artifactId>
      <version>1.2</version>
    </dependency>
    <dependency>
      <groupId>commons-io</groupId>
      <artifactId>commons-io</artifactId>
      <version>2.4</version>
    </dependency>
    <dependency>
      <groupId>org.apache.commons</groupId>
      <artifactId>commons-lang3</artifactId>
      <version>3.4</version>
    </dependency>
    <dependency>
      <groupId>edu.stanford.nlp</groupId>
      <artifactId>stanford-corenlp</artifactId>
      <version>3.5.2</version>
    </dependency>
    <dependency>
      <groupId>edu.stanford.nlp</groupId>
      <artifactId>stanford-corenlp</artifactId>
      <version>3.5.2</version>
      <classifier>models</classifier>
    </dependency>
  </dependencies>
</project>
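Since the assembly plugin above binds the jar-with-dependencies descriptor to the package phase and declares edu.uma.nlp.graphseg.Start as the main class, rebuilding the self-contained, runnable jar from the /source folder should (assuming a standard Maven installation) amount to:

mvn package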
source/src/config.properties
ADDED
@@ -0,0 +1,3 @@
inf-cont-path=C:/Goran/Corpora/unigram-freqs-english.txt
word-vec-path=C:/Goran/Corpora/WordVectors/glove-vectors-6b-200d.txt
stop-words-path=C:/Goran/Corpora/stopwords.txt
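These three keys are read at startup through ApplicationConfiguration (see Start.java below in this commit), so before running from source the Windows-style paths above have to be repointed at local copies of the English unigram-frequency list, the word-vector file, and the stopword list. The consuming code in Start.main, quoted for orientation:

    List<String> stopwords = IOHelper.getAllLines(ApplicationConfiguration.config.getValue("stop-words-path"));
    MemoryStorage.setWordVectorSpace(new WordVectorSpace());
    MemoryStorage.getWordVectorSpace().load(ApplicationConfiguration.config.getValue("word-vec-path"), null);
    MemoryStorage.setInformationContent(new InformationContent(ApplicationConfiguration.config.getValue("inf-cont-path"), 1));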
source/src/edu/uma/nlp/graphseg/ClusteringHandler.java
ADDED
@@ -0,0 +1,206 @@
package edu.uma.nlp.graphseg;

import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.Optional;
import java.util.stream.Collectors;

public class ClusteringHandler {

    public List<List<Integer>> getSequentialClusters(List<List<Integer>> cliques, Map<Integer, Map<Integer, Double>> allSimilarities, int largestTooSmallClusterSize)
    {
        List<List<Integer>> sequentialClusters = new ArrayList<List<Integer>>();

        System.out.println("Merging cliques...");
        mergeCliques(cliques, sequentialClusters);
        System.out.println("Merging singletons...");
        mergeSingletons(cliques, sequentialClusters, allSimilarities);
        System.out.println("Merging too small sequences...");
        mergeTooSmallSequences(sequentialClusters, allSimilarities, largestTooSmallClusterSize);

        return sequentialClusters;
    }

    private void mergeCliques(List<List<Integer>> cliques, List<List<Integer>> sequentialClusters)
    {
        boolean change = true;
        while(change)
        {
            change = false;
            for(List<Integer> clique : cliques)
            {
                for(int i = 0; i < clique.size() - 1; i++)
                {
                    for(int j = i+1; j < clique.size(); j++)
                    {
                        int ind = i;
                        int jond = j;
                        Optional<List<Integer>> existingClusterFirst = sequentialClusters.stream().filter(sc -> sc.contains(clique.get(ind))).findFirst();
                        Optional<List<Integer>> existingClusterSecond = sequentialClusters.stream().filter(sc -> sc.contains(clique.get(jond))).findFirst();

                        // Both nodes from the clique already placed in clusters
                        if (existingClusterFirst.isPresent() && existingClusterSecond.isPresent())
                        {
                            continue;
                        }

                        // Neither of the nodes is in the cluster
                        else if (!existingClusterFirst.isPresent() && !existingClusterSecond.isPresent())
                        {
                            // if these are consecutive sentences, we make a new cluster
                            if (Math.abs(clique.get(i) - clique.get(j)) == 1)
                            {
                                List<Integer> newCluster = new ArrayList<Integer>();
                                newCluster.add(Math.min(clique.get(i), clique.get(j)));
                                newCluster.add(Math.max(clique.get(i), clique.get(j)));

                                int insertIndex = -1;
                                for(int k = 0; k < sequentialClusters.size(); k++)
                                {
                                    if (newCluster.get(newCluster.size() - 1) < sequentialClusters.get(k).get(0))
                                    {
                                        insertIndex = k;
                                        break;
                                    }
                                }

                                if (insertIndex >= 0) sequentialClusters.add(insertIndex, newCluster);
                                else sequentialClusters.add(newCluster);

                                change = true;
                            }
                        }

                        // one node is in one cluster, the other isn't
                        else
                        {
                            List<Integer> cluster = existingClusterFirst.isPresent() ? existingClusterFirst.get() : existingClusterSecond.get();
                            int node = existingClusterFirst.isPresent() ? clique.get(j) : clique.get(i);

                            if ((node == cluster.get(0) - 1) || (node == cluster.get(cluster.size()-1) + 1))
                            {
                                cluster.add(node);
                                cluster.sort((e1, e2) -> e1 < e2 ? -1 : (e1 > e2 ? 1 : 0));

                                change = true;
                            }
                        }
                    }
                }
            }
        }
    }

    private List<Integer> computeSingletons(List<List<Integer>> cliques, List<List<Integer>> sequentialClusters)
    {
        List<Integer> singletons = new ArrayList<Integer>();
        for(List<Integer> c : cliques)
        {
            for(int n : c)
            {
                if (!sequentialClusters.stream().anyMatch(sc -> sc.contains(n))) singletons.add(n);
            }
        }

        singletons = singletons.stream().distinct().collect(Collectors.toList());
        singletons.sort((s1, s2) -> s1 < s2 ? -1 : (s1 > s2 ? 1 : 0));
        return singletons;
    }

    private void mergeTooSmallSequences(List<List<Integer>> sequentialClusters, Map<Integer, Map<Integer, Double>> allSimilarities, int largestSmallCluster)
    {
        boolean change = true;
        while(change)
        {
            change = false;
            Optional<List<Integer>> firstSmallCluster = sequentialClusters.stream().filter(c -> c.size() <= largestSmallCluster).findFirst();
            if (firstSmallCluster.isPresent())
            {
                int i = sequentialClusters.indexOf(firstSmallCluster.get());
                double similarityPrevious = (i == 0) ? 0 : averageClusterSimilarity(sequentialClusters.get(i-1), sequentialClusters.get(i), allSimilarities);
                double similarityNext = (i == (sequentialClusters.size() - 1)) ? 0 : averageClusterSimilarity(sequentialClusters.get(i), sequentialClusters.get(i+1), allSimilarities);

                List<Integer> clusterToMergeWith = (similarityPrevious > similarityNext) ? sequentialClusters.get(i-1) : sequentialClusters.get(i+1);
                List<Integer> newCluster = new ArrayList<Integer>();
                newCluster.addAll(clusterToMergeWith);
                newCluster.addAll(sequentialClusters.get(i));
                newCluster.sort((i1, i2) -> i1 > i2 ? 1 : (i1 < i2 ? -1 : 0));

                sequentialClusters.add((similarityPrevious > similarityNext) ? i-1 : i, newCluster);
                sequentialClusters.remove(firstSmallCluster.get());
                sequentialClusters.remove(clusterToMergeWith);

                change = true;
            }
        }
    }

    private double averageClusterSimilarity(List<Integer> first, List<Integer> second, Map<Integer, Map<Integer, Double>> allSimilarities)
    {
        double sum = 0;
        for(int i = 0; i < first.size(); i++)
        {
            for(int j = 0; j < second.size(); j++)
            {
                sum += allSimilarities.get(Math.min(first.get(i), second.get(j))).get(Math.max(first.get(i), second.get(j)));
            }
        }
        return sum / ((double)(first.size() * second.size()));
    }

    private void mergeSingletons(List<List<Integer>> cliques, List<List<Integer>> sequentialClusters, Map<Integer, Map<Integer, Double>> allSimilarities)
    {
        List<Integer> singletons = computeSingletons(cliques, sequentialClusters);

        while(singletons.size() > 0)
        {
            if (singletons.size() % 10 == 0) System.out.println("Remaining singletons: " + singletons.size());

            int node = singletons.get(0);
            Optional<List<Integer>> previousNodeCluster = sequentialClusters.stream().filter(sc -> sc.contains(node - 1)).findFirst();
            Optional<List<Integer>> nextNodeCluster = sequentialClusters.stream().filter(sc -> sc.contains(node + 1)).findFirst();

            double similarityPrevious = node == 0 ? -1.0 : (previousNodeCluster.isPresent() ? similarityNodeCluster(node, previousNodeCluster.get(), allSimilarities) : allSimilarities.get(node - 1).get(node));
            double similarityNext = node == allSimilarities.size() ? -1.0 : (nextNodeCluster.isPresent() ? similarityNodeCluster(node, nextNodeCluster.get(), allSimilarities) : allSimilarities.get(node).get(node + 1));

            boolean previous = similarityPrevious >= similarityNext;
            boolean mergeWithCluster = previous ? previousNodeCluster.isPresent() : nextNodeCluster.isPresent();

            if (mergeWithCluster)
            {
                if (previous) previousNodeCluster.get().add(node);
                else nextNodeCluster.get().add(0, node);
            }
            else
            {
                List<Integer> newCluster = new ArrayList<Integer>();
                newCluster.add(previous ? node - 1 : node);
                newCluster.add(previous ? node : node + 1);

                int insertIndex = -1;

                for(int k = 0; k < sequentialClusters.size(); k++)
                {
                    if (newCluster.get(newCluster.size() - 1) < sequentialClusters.get(k).get(0))
                    {
                        insertIndex = k;
                        break;
                    }
                }

                if (insertIndex >= 0) sequentialClusters.add(insertIndex, newCluster);
                else sequentialClusters.add(newCluster);
            }

            singletons = computeSingletons(cliques, sequentialClusters);
        }
    }

    private double similarityNodeCluster(int node, List<Integer> cluster, Map<Integer, Map<Integer, Double>> allSimilarities)
    {
        double average = 0;
        for(Integer n2 : cluster) average += allSimilarities.get(Math.min(node, n2)).get(Math.max(node, n2));
        return average / ((double)cluster.size());
    }
}
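To make the merge phases above concrete, here is a small hand-traced run (toy input of mine, not from the paper). Given five sentences 0-4 and maximal cliques {0,1}, {1,2}, {3,4}: mergeCliques turns the consecutive pair {0,1} into a cluster, extends it to [0,1,2] through the shared node of {1,2}, and opens a second cluster [3,4]; no singletons remain, and with a size parameter of 1 no cluster is small enough to absorb:

    cliques            : {0,1}, {1,2}, {3,4}
    after mergeCliques : [0,1,2], [3,4]
    singletons         : none
    final segments     : [0,1,2], [3,4]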
source/src/edu/uma/nlp/graphseg/GraphHandler.java
ADDED
@@ -0,0 +1,134 @@
package edu.uma.nlp.graphseg;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;

import org.jgrapht.UndirectedGraph;
import org.jgrapht.alg.BronKerboschCliqueFinder;
import org.jgrapht.alg.ConnectivityInspector;
import org.jgrapht.alg.KuhnMunkresMinimalWeightBipartitePerfectMatching;
import org.jgrapht.generate.SimpleWeightedBipartiteGraphMatrixGenerator;
import org.jgrapht.generate.WeightedGraphGeneratorAdapter;
import org.jgrapht.graph.DefaultEdge;
import org.jgrapht.graph.DefaultWeightedEdge;
import org.jgrapht.graph.SimpleGraph;
import org.jgrapht.graph.SimpleWeightedGraph;

import edu.uma.nlp.graphseg.preprocessing.Document;
import edu.uma.nlp.graphseg.preprocessing.TokenAnnotation;
import edu.uma.nlp.graphseg.utils.MemoryStorage;

public class GraphHandler {

    private static List<String> stopwords;
    public static void setStopwords(List<String> stwrds)
    {
        stopwords = stwrds;
    }

    private static Map<Integer, Map<Integer, Double>> allSimilarities;
    public static Map<Integer, Map<Integer, Double>> getAllSimilarities()
    {
        return allSimilarities;
    }

    public static UndirectedGraph<Integer, DefaultEdge> constructGraph(List<Document> snippets, double similarityTreshold)
    {
        int localizationSize = 100;
        allSimilarities = new HashMap<Integer, Map<Integer, Double>>();

        UndirectedGraph<Integer, DefaultEdge> graph = new SimpleGraph<Integer, DefaultEdge>(DefaultEdge.class);

        snippets.forEach(s -> graph.addVertex(Integer.parseInt(s.getId())));
        for(int i = 0; i < snippets.size() - 1; i++)
        {
            if (i % 10 == 0) System.out.println("Constructing graph, outer loop " + i + "/" + snippets.size());
            allSimilarities.put(i, new HashMap<Integer, Double>());

            for(int j = i + 1; j < Math.min(snippets.size(), i + localizationSize); j++)
            {
                List<TokenAnnotation> contentTokenFirst = snippets.get(i).getTokens().stream().filter(t -> t.getPartOfSpeech().isContent() && !stopwords.contains(t.getLemma().toLowerCase())).collect(Collectors.toList());
                List<TokenAnnotation> contentTokenSecond = snippets.get(j).getTokens().stream().filter(t -> t.getPartOfSpeech().isContent() && !stopwords.contains(t.getLemma().toLowerCase())).collect(Collectors.toList());

                if (contentTokenFirst.size() == 0 || contentTokenSecond.size() == 0)
                {
                    allSimilarities.get(i).put(j, 0.0);
                    continue;
                }

                // preparing for bipartite graph min matching
                double[][] dissimilarities = new double[Math.max(contentTokenFirst.size(), contentTokenSecond.size())][Math.max(contentTokenFirst.size(), contentTokenSecond.size())];
                List<Integer> firstPartition = new ArrayList<Integer>();
                List<Integer> secondPartition = new ArrayList<Integer>();

                for(int k = 0; k < Math.max(contentTokenFirst.size(), contentTokenSecond.size()); k++)
                {
                    for(int l = 0; l < Math.max(contentTokenFirst.size(), contentTokenSecond.size()); l++)
                    {
                        if (k >= contentTokenFirst.size() || l >= contentTokenSecond.size())
                        {
                            dissimilarities[k][l] = 1;
                        }
                        else
                        {
                            double icFactor = Math.max(MemoryStorage.getInformationContent().getRelativeInformationContent(contentTokenFirst.get(k).getLemma().toLowerCase()), MemoryStorage.getInformationContent().getRelativeInformationContent(contentTokenSecond.get(l).getLemma().toLowerCase()));
                            double simTokens = MemoryStorage.getWordVectorSpace().similarity(contentTokenFirst.get(k).getLemma().toLowerCase(), contentTokenSecond.get(l).getLemma().toLowerCase());
                            if (simTokens < 0) simTokens = 0;

                            dissimilarities[k][l] = 1 - icFactor * simTokens;
                        }
                    }
                }
                for(int z = 0; z < Math.max(contentTokenFirst.size(), contentTokenSecond.size()); z++)
                {
                    firstPartition.add(z);
                    secondPartition.add(z + Math.max(contentTokenFirst.size(), contentTokenSecond.size()));
                }

                double bmScore = minimumAverageBipartiteGraphMatchingScore(dissimilarities, firstPartition, secondPartition) - (Math.abs(contentTokenFirst.size() - contentTokenSecond.size()));
                double similarityNonNormalized = Math.min(contentTokenFirst.size(), contentTokenSecond.size()) - bmScore;
                double similarity = ((similarityNonNormalized / contentTokenFirst.size()) + (similarityNonNormalized / contentTokenSecond.size())) / 2.0;

                //double similarity = SemanticSimilarity.greedyAlignmentOverlapFScore(snippets.get(i).getTokens(), snippets.get(j).getTokens(), MemoryStorage.getWordVectorSpace(), MemoryStorage.getInformationContent(), true);
                allSimilarities.get(i).put(j, similarity);

                if (similarity > similarityTreshold)
                {
                    graph.addEdge(Integer.parseInt(snippets.get(i).getId()), Integer.parseInt(snippets.get(j).getId()));
                }
            }
        }

        return graph;
    }

    public static double minimumAverageBipartiteGraphMatchingScore(double[][] dissimilarities, List<Integer> firstPartition, List<Integer> secondPartition)
    {
        SimpleWeightedGraph<Integer, DefaultWeightedEdge> bipartiteGraph = new SimpleWeightedGraph<>(DefaultWeightedEdge.class);
        WeightedGraphGeneratorAdapter<Integer, DefaultWeightedEdge, Integer> generator =
            new SimpleWeightedBipartiteGraphMatrixGenerator<Integer, DefaultWeightedEdge>()
                .first  (firstPartition)
                .second (secondPartition)
                .weights(dissimilarities);

        generator.generateGraph(bipartiteGraph, null, null);
        KuhnMunkresMinimalWeightBipartitePerfectMatching<Integer, DefaultWeightedEdge> bipartiteMatching = new KuhnMunkresMinimalWeightBipartitePerfectMatching<Integer, DefaultWeightedEdge>(bipartiteGraph, firstPartition, secondPartition);

        return bipartiteMatching.getMatchingWeight();
    }

    public static List<List<Integer>> getAllCliques(UndirectedGraph<Integer, DefaultEdge> graph)
    {
        BronKerboschCliqueFinder<Integer, DefaultEdge> finder = new BronKerboschCliqueFinder<Integer, DefaultEdge>(graph);
        return finder.getAllMaximalCliques().stream().map(x -> x.stream().collect(Collectors.toList())).collect(Collectors.toList());
    }

    public static List<List<Integer>> getAllConnectedComponents(UndirectedGraph<Integer, DefaultEdge> graph)
    {
        ConnectivityInspector<Integer, DefaultEdge> finder = new ConnectivityInspector<Integer, DefaultEdge>(graph);
        return finder.connectedSets().stream().map(x -> x.stream().collect(Collectors.toList())).collect(Collectors.toList());
    }
}
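Reading the similarity that constructGraph stores for a sentence pair (notation mine, not from the source): with m and n content tokens in the two sentences, the dissimilarity matrix is padded to size max(m, n) with unit-cost dummy cells, so the minimal perfect matching of weight W necessarily spends exactly |m - n| on forced dummy assignments. Subtracting that and flipping cost back into similarity recovers the information-content-weighted similarity mass s of the optimal word alignment, which is then normalized against both sentence lengths:

    bmScore = W - |m - n|           (drop the forced dummy-cell cost)
    s       = min(m, n) - bmScore   (aligned similarity mass: sum of icFactor * simTokens)
    sim     = (s/m + s/n) / 2       (average of the two per-sentence normalizations)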
source/src/edu/uma/nlp/graphseg/IOHandler.java
ADDED
@@ -0,0 +1,33 @@
package edu.uma.nlp.graphseg;

import java.io.BufferedWriter;
import java.io.File;
import java.io.FileOutputStream;
import java.io.OutputStreamWriter;
import java.util.List;

public class IOHandler {
    public static void writeSegmentation(List<String> rawLines, List<List<Integer>> segmentation, String path)
    {
        try {
            File fout = new File(path);
            FileOutputStream fos;
            fos = new FileOutputStream(fout);
            BufferedWriter bw = new BufferedWriter(new OutputStreamWriter(fos));

            for(int i = 0; i < segmentation.size(); i++)
            {
                for(int j = 0; j < segmentation.get(i).size(); j++)
                {
                    bw.write(rawLines.get(segmentation.get(i).get(j)) + "\n");
                }
                bw.write("==========\n");
            }
            bw.close();
        }
        catch (Exception e) {
            // TODO Auto-generated catch block
            e.printStackTrace();
        }
    }
}
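The files written by writeSegmentation are plain text: the sentences of each segment appear on consecutive lines, and every segment is closed by a delimiter line of ten '=' characters, e.g. (illustrative sentences):

    First sentence of the first segment.
    Second sentence of the first segment.
    ==========
    Only sentence of the second segment.
    ==========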
source/src/edu/uma/nlp/graphseg/STSHandler.java
ADDED
@@ -0,0 +1,37 @@
package edu.uma.nlp.graphseg;

import java.util.ArrayList;
import java.util.List;

import org.javatuples.Triplet;

import edu.uma.nlp.graphseg.preprocessing.Document;
import edu.uma.nlp.graphseg.semantics.InformationContent;
import edu.uma.nlp.graphseg.semantics.SemanticSimilarity;
import edu.uma.nlp.graphseg.semantics.WordVectorSpace;


public class STSHandler {
    public static List<Triplet<Document, Document, Double>> getSemanticSimilarities(List<Document> snippets, double simTreshold, WordVectorSpace vectorSpace, InformationContent informationContent)
    {
        List<Triplet<Document, Document, Double>> similarityGraph = new ArrayList<Triplet<Document, Document, Double>>();

        for(int i = 0; i < snippets.size() - 1; i++)
        {
            System.out.println("Outer loop: " + String.valueOf(i+1) + "/" + String.valueOf(snippets.size() - 1));
            for(int j = i + 1; j < snippets.size(); j++)
            {
                //if (j % 100 == 0) System.out.println("Inner loop: " + String.valueOf(j+1) + "/" + String.valueOf(snippets.size()));

                double similarity = SemanticSimilarity.greedyAlignmentOverlapFScore(snippets.get(i).getTokens(), snippets.get(j).getTokens(), vectorSpace, informationContent, true);
                if (similarity > simTreshold)
                {
                    similarityGraph.add(new Triplet<Document, Document, Double>(snippets.get(i), snippets.get(j), similarity));
                }
            }
        }

        similarityGraph.sort((i1, i2) -> i1.getValue2() > i2.getValue2() ? -1 : (i1.getValue2() < i2.getValue2() ? 1 : 0));
        return similarityGraph;
    }
}
source/src/edu/uma/nlp/graphseg/Start.java
ADDED
@@ -0,0 +1,122 @@
package edu.uma.nlp.graphseg;

import java.io.File;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.stream.Collectors;

import org.apache.commons.io.FileUtils;
import org.jgrapht.UndirectedGraph;
import org.jgrapht.graph.DefaultEdge;

import edu.uma.nlp.graphseg.preprocessing.Document;
import edu.uma.nlp.graphseg.preprocessing.StanfordAnnotator;
import edu.uma.nlp.graphseg.semantics.InformationContent;
import edu.uma.nlp.graphseg.semantics.SemanticSimilarity;
import edu.uma.nlp.graphseg.semantics.WordVectorSpace;
import edu.uma.nlp.graphseg.utils.ApplicationConfiguration;
import edu.uma.nlp.graphseg.utils.IOHelper;
import edu.uma.nlp.graphseg.utils.MemoryStorage;

public class Start {

    public static void main(String[] args) throws NumberFormatException, IOException {
        // TODO Auto-generated method stub

        // checking the arguments
        if (args.length < 4)
        {
            System.out.println("USAGE: java -jar graphseg.jar <input-dir> <output-dir> <rel-treshold> <min-segment>");
            return;
        }

        File inputDirFile = new File(args[0]);
        File outputDirFile = new File(args[1]);

        if (!inputDirFile.exists() || !outputDirFile.exists() || !inputDirFile.isDirectory() || !outputDirFile.isDirectory())
        {
            System.out.println("USAGE: java -jar graphseg.jar <input-dir> <output-dir> <rel-treshold (double, <0,1>)> <min-segment (int)>");
            return;
        }

        double treshold = 0;
        try
        {
            treshold = Double.parseDouble(args[2]);
            if (treshold < 0 || treshold > 1)
            {
                throw new UnsupportedOperationException();
            }
        }
        catch(NumberFormatException ex)
        {
            System.out.println("USAGE: java -jar graphseg.jar <input-dir> <output-dir> <rel-treshold (double, <0,1>)> <min-segment (int)>");
            return;
        }

        int minseg = 0;
        try
        {
            minseg = Integer.parseInt(args[3]);
            if (minseg < 1)
            {
                throw new UnsupportedOperationException();
            }
        }
        catch(NumberFormatException ex)
        {
            System.out.println("USAGE: java -jar graphseg.jar <input-dir> <output-dir> <rel-treshold (double, <0,1>)> <min-segment (int, >=1)>");
            return;
        }

        List<String> stopwords = IOHelper.getAllLines(ApplicationConfiguration.config.getValue("stop-words-path"));
        MemoryStorage.setWordVectorSpace(new WordVectorSpace());
        MemoryStorage.getWordVectorSpace().load(ApplicationConfiguration.config.getValue("word-vec-path"), null);

        MemoryStorage.setInformationContent(new InformationContent(ApplicationConfiguration.config.getValue("inf-cont-path"), 1));


        SemanticSimilarity.setStopwords(stopwords);
        GraphHandler.setStopwords(stopwords);

        StanfordAnnotator annotator = new StanfordAnnotator();


        for(Path file : Files.walk(Paths.get(args[0])).filter(x -> (new File(x.toString()).isFile())).collect(Collectors.toList()))
        {
            System.out.println("Segmenting file: " + file.toString());

            annotator.setStanfordAnnotators(new ArrayList<String>(Arrays.asList("tokenize", "ssplit")));

            String content = FileUtils.readFileToString(new File(file.toString()));
            Document doc = new Document();
            doc.setText(content);
            annotator.annotate(doc);

            annotator.setStanfordAnnotators(new ArrayList<String>(Arrays.asList("tokenize", "ssplit", "pos", "lemma")));

            List<Document> snippets = new ArrayList<Document>();
            for(int i = 0; i < doc.getSentences().size(); i++)
            {
                Document snippet = new Document(doc.getSentences().get(i).getText());
                annotator.annotate(snippet);
                snippet.setId(String.valueOf(i));
                snippets.add(snippet);
            }

            UndirectedGraph<Integer, DefaultEdge> graph = GraphHandler.constructGraph(snippets, treshold);
            System.out.println("Computing cliques...");
            List<List<Integer>> cliques = GraphHandler.getAllCliques(graph);

            ClusteringHandler clusterer = new ClusteringHandler();
            System.out.println("Constructing linear segments...");
            List<List<Integer>> sequentialClusters = clusterer.getSequentialClusters(cliques, GraphHandler.getAllSimilarities(), minseg);
            IOHandler.writeSegmentation(doc.getSentences().stream().map(x -> x.getText()).collect(Collectors.toList()), sequentialClusters, args[1] + (args[1].endsWith("/") ? "" : "/") + file.getFileName().toString());
        }
    }
}
source/src/edu/uma/nlp/graphseg/preprocessing/Annotation.java
ADDED
@@ -0,0 +1,36 @@
package edu.uma.nlp.graphseg.preprocessing;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;

public class Annotation {

    protected HashMap<AnnotationType, List<Annotation>> childAnnotations;

    public Annotation()
    {
        childAnnotations = new HashMap<AnnotationType, List<Annotation>>();
    }

    public List<Annotation> getChildAnnotations(AnnotationType type)
    {
        if (childAnnotations.containsKey(type)) return childAnnotations.get(type);
        else return new ArrayList<Annotation>();
    }

    public void addChildAnnotation(Annotation annotation, AnnotationType type)
    {
        if (!childAnnotations.containsKey(type)) childAnnotations.put(type, new ArrayList<Annotation>());
        childAnnotations.get(type).add(annotation);
    }

    public void removeChildAnnotation(Annotation annotation)
    {
        if (childAnnotations.containsKey(annotation))
        {
            childAnnotations.remove(annotation);
        }
    }

}
source/src/edu/uma/nlp/graphseg/preprocessing/AnnotationType.java
ADDED
@@ -0,0 +1,14 @@
package edu.uma.nlp.graphseg.preprocessing;

public enum AnnotationType {
    Corpus,
    Document,
    SentenceAnnotation,
    TokenAnnotation,
    MorphologyAnnotation,
    PartOfSpeechAnnotation,
    NamedEntityAnnotation,
    NamedEntityTokenAnnotation,
    ChunkAnnotation,
    DependencyAnnotation
}
source/src/edu/uma/nlp/graphseg/preprocessing/AnnotatorChain.java
ADDED
@@ -0,0 +1,35 @@
package edu.uma.nlp.graphseg.preprocessing;

import java.util.ArrayList;
import java.util.List;

public class AnnotatorChain {

    private List<IAnnotator> chain;

    public AnnotatorChain()
    {
    }


    public AnnotatorChain(List<IAnnotator> annotators)
    {
        chain = annotators;
    }

    public AnnotatorChain addAnnotator(IAnnotator annotator)
    {
        if (chain == null) chain = new ArrayList<IAnnotator>();
        chain.add(annotator);
        return this;
    }

    public void apply(Annotation textUnit)
    {
        for (int i = 0; i < chain.size(); i++)
        {
            chain.get(i).annotate(textUnit);
        }
    }

}
source/src/edu/uma/nlp/graphseg/preprocessing/AnnotatorType.java
ADDED
@@ -0,0 +1,11 @@
package edu.uma.nlp.graphseg.preprocessing;

public enum AnnotatorType
{
    SentenceSplitter,
    Tokenizer,
    POSTagger,
    Morphology,
    NamedEntityExtractor,
    Chunker
}
source/src/edu/uma/nlp/graphseg/preprocessing/Document.java
ADDED
@@ -0,0 +1,110 @@
package edu.uma.nlp.graphseg.preprocessing;

import java.util.ArrayList;
import java.util.List;
import java.util.stream.Collectors;

public class Document extends Annotation {

    private String id;

    public String getId() {
        return id;
    }

    public void setId(String id) {
        this.id = id;
    }

    private String path;

    public String getPath() {
        return path;
    }

    public void setPath(String path) {
        this.path = path;
    }

    private String text;

    public String getText() {
        return text;
    }

    public void setText(String text) {
        this.text = text;
    }


    // Lazy loading

    private List<TokenAnnotation> tokens;

    public List<TokenAnnotation> getTokens() {
        if (tokens == null) tokens = loadTokens();
        return tokens;
    }

    private List<SentenceAnnotation> sentences;

    public List<SentenceAnnotation> getSentences() {
        if (sentences == null) sentences = loadSentences();
        return sentences;
    }

    private List<NamedEntityAnnotation> namedEntities;

    public List<NamedEntityAnnotation> getNamedEntities() {
        if (namedEntities == null) namedEntities = loadNamedEntities();
        return namedEntities;
    }

    private List<TokenAnnotation> loadTokens()
    {
        if (getSentences() != null)
        {
            List<TokenAnnotation> toks = new ArrayList<TokenAnnotation>();
            for (int i = 0; i < sentences.size(); i++)
            {
                toks.addAll(sentences.get(i).getTokens());
            }

            toks.sort((t1, t2) -> (t1.getStartPosition() < t2.getStartPosition()) ? -1 : ((t1.getStartPosition() > t2.getStartPosition()) ? 1 : 0));
            return toks;
        }
        else return null;

    }

    private List<SentenceAnnotation> loadSentences()
    {
        if (childAnnotations.containsKey(AnnotationType.SentenceAnnotation))
            return childAnnotations.get(AnnotationType.SentenceAnnotation).stream().map(x -> (SentenceAnnotation)x).collect(Collectors.toList());
        else return null;
    }

    private List<NamedEntityAnnotation> loadNamedEntities()
    {
        if (childAnnotations.containsKey(AnnotationType.NamedEntityAnnotation))
            return childAnnotations.get(AnnotationType.NamedEntityAnnotation).stream().map(x -> (NamedEntityAnnotation)x).collect(Collectors.toList());
        else return null;
    }

    // Ctors

    public Document()
    {
    }

    public Document(String text)
    {
        this.text = text;
    }

    public Document(String id, String text)
    {
        this.id = id;
        this.text = text;
    }
}
source/src/edu/uma/nlp/graphseg/preprocessing/IAnnotator.java
ADDED
@@ -0,0 +1,9 @@
package edu.uma.nlp.graphseg.preprocessing;

import java.util.List;

public interface IAnnotator
{
    void annotate(Annotation textUnit);
    List<Annotation> annotate(String text);
}
source/src/edu/uma/nlp/graphseg/preprocessing/NamedEntityAnnotation.java
ADDED
@@ -0,0 +1,88 @@
package edu.uma.nlp.graphseg.preprocessing;

import java.util.List;
import java.util.stream.Collectors;

import org.apache.commons.lang3.StringUtils;

public class NamedEntityAnnotation extends Annotation {

    private NamedEntityType namedEntityType;

    public NamedEntityType getNamedEntityType() {
        return namedEntityType;
    }

    public void setNamedEntityType(NamedEntityType namedEntityType) {
        this.namedEntityType = namedEntityType;
    }

    private String text;

    public String getText() {
        if ((text == null || StringUtils.isEmpty(text)) && getTokens().size() > 0)
        {
            text = "";
            for(int i = 0; i < tokens.size(); i++)
            {
                text += tokens.get(i).getText() + " ";
            }
            text = text.trim();
        }
        return text;
    }

    public void setText(String text) {
        this.text = text;
    }

    private int startPosition;

    public int getStartPosition() {
        return startPosition;
    }

    public void setStartPosition(int startPosition) {
        this.startPosition = startPosition;
    }

    public Boolean isPerson()
    {
        return namedEntityType == NamedEntityType.Person;
    }

    public Boolean isLocation()
    {
        return namedEntityType == NamedEntityType.Location;
    }

    public Boolean isOrganization()
    {
        return namedEntityType == NamedEntityType.Organization;
    }

    private List<TokenAnnotation> tokens;
    public List<TokenAnnotation> getTokens()
    {
        if (tokens == null) tokens = loadTokens();
        return tokens;
    }

    private List<TokenAnnotation> loadTokens()
    {
        if (childAnnotations.containsKey(AnnotationType.TokenAnnotation)) return childAnnotations.get(AnnotationType.TokenAnnotation).stream().map(x -> (TokenAnnotation)x).collect(Collectors.toList());
        else return null;
    }

    public NamedEntityAnnotation(NamedEntityType type)
    {
        namedEntityType = type;
    }

    public NamedEntityAnnotation(NamedEntityType type, String text, int startPosition)
    {
        this.namedEntityType = type;
        this.text = text;
        this.startPosition = startPosition;
    }
}
source/src/edu/uma/nlp/graphseg/preprocessing/NamedEntityTokenAnnotation.java
ADDED
@@ -0,0 +1,38 @@
package edu.uma.nlp.graphseg.preprocessing;

public class NamedEntityTokenAnnotation extends Annotation {

    private String namedEntityLabel;

    public String getNamedEntityLabel() {
        return namedEntityLabel;
    }

    public void setNamedEntityLabel(String namedEntityLabel) {
        this.namedEntityLabel = namedEntityLabel;
    }

    public NamedEntityTokenAnnotation()
    {
    }

    public NamedEntityTokenAnnotation(String label)
    {
        namedEntityLabel = label;
    }

    public Boolean constitutesNamedEntity()
    {
        return startsNamedEntity() || insideNamedEntity();
    }

    public Boolean startsNamedEntity()
    {
        return namedEntityLabel.startsWith("B");
    }

    public Boolean insideNamedEntity()
    {
        return namedEntityLabel.startsWith("I");
    }
}
source/src/edu/uma/nlp/graphseg/preprocessing/NamedEntityType.java
ADDED
@@ -0,0 +1,18 @@
package edu.uma.nlp.graphseg.preprocessing;

public enum NamedEntityType {
    Person,
    Location,
    Organization,
    Money,
    Percentage,
    Date,
    Time,
    Ordinal,
    Percent,
    Number,
    Set,
    Duration,
    Misc,
    None
}
source/src/edu/uma/nlp/graphseg/preprocessing/PartOfSpeechAnnotation.java
ADDED
@@ -0,0 +1,69 @@
package edu.uma.nlp.graphseg.preprocessing;

import java.util.Arrays;
import java.util.List;

public class PartOfSpeechAnnotation extends Annotation {

    private String tag;

    public String getTag() {
        return tag;
    }

    public void setTag(String tag) {
        this.tag = tag;
    }

    private String chunkTag;

    public String getChunkTag() {
        return chunkTag;
    }

    public void setChunkTag(String chunkTag) {
        this.chunkTag = chunkTag;
    }

    private String coarseTag;
    public String getCoarseTag() {
        if (tag != null) coarseTag = loadCoarsePoSTag();
        return coarseTag;
    }

    private String loadCoarsePoSTag()
    {
        if (isNoun()) return "N";
        else if (isVerb()) return "V";
        else if (isAdjective()) return "J";
        else if (isAdverb()) return "R";
        else return "O";
    }

    public Boolean isContent()
    {
        List<String> otherContentPOS = Arrays.asList("CD", "FW", "MD", "SYM", "UH");
        return isNoun() || isVerb() || isAdjective() || isAdverb() || otherContentPOS.contains(tag);
    }

    public Boolean isNoun()
    {
        return tag != null && tag.startsWith("N");
    }

    public Boolean isVerb()
    {
        return tag != null && tag.startsWith("V");
    }

    public Boolean isAdjective()
    {
        return tag != null && tag.startsWith("J");
    }

    public Boolean isAdverb()
    {
        return tag != null && tag.startsWith("RB");
    }

}
source/src/edu/uma/nlp/graphseg/preprocessing/SentenceAnnotation.java
ADDED
@@ -0,0 +1,66 @@
package edu.uma.nlp.graphseg.preprocessing;

import java.util.List;
import java.util.stream.Collectors;

public class SentenceAnnotation extends Annotation {

    // Fields & properties

    private String text;

    public String getText() {
        return text;
    }

    public void setText(String text) {
        this.text = text;
    }

    private int startPosition;

    public int getStartPosition() {
        return startPosition;
    }

    public void setStartPosition(int startPosition) {
        this.startPosition = startPosition;
    }

    public int getEndPosition()
    {
        return text != null ? startPosition + text.length() - 1 : startPosition;
    }

    // Lazy properties

    private List<TokenAnnotation> tokens;
    public List<TokenAnnotation> getTokens()
    {
        if (tokens == null) tokens = loadTokens();
        return tokens;
    }

    public void setTokens(List<TokenAnnotation> tokens)
    {
        this.tokens = tokens;
    }

    private List<TokenAnnotation> loadTokens()
    {
        if (childAnnotations.containsKey(AnnotationType.TokenAnnotation)) return childAnnotations.get(AnnotationType.TokenAnnotation).stream().map(x -> (TokenAnnotation)x).collect(Collectors.toList());
        else return null;
    }


    // Ctors
    public SentenceAnnotation()
    {
    }

    public SentenceAnnotation(String text, int startPosition)
    {
        this.text = text;
        this.startPosition = startPosition;
    }
}
source/src/edu/uma/nlp/graphseg/preprocessing/StanfordAnnotator.java
ADDED
@@ -0,0 +1,142 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
package edu.uma.nlp.graphseg.preprocessing;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Properties;

import edu.stanford.nlp.ling.CoreAnnotations.CharacterOffsetBeginAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.NamedEntityTagAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.SentencesAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.TextAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.TokensAnnotation;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.pipeline.StanfordCoreNLP;
import edu.stanford.nlp.util.CoreMap;

public class StanfordAnnotator implements IAnnotator {

    private List<String> stanfordAnnotators;
    private String stanfordAnnotatorsString;

    public void setStanfordAnnotators(List<String> stanfordAnnotators) {
        this.stanfordAnnotators = stanfordAnnotators;
        // comma-separated annotator list expected by the CoreNLP "annotators" property
        stanfordAnnotatorsString = String.join(", ", this.stanfordAnnotators);
    }

    @Override
    public void annotate(Annotation textUnit)
    {
        if (!(textUnit instanceof Document))
            throw new UnsupportedOperationException("Only whole documents can be processed by Stanford's CoreNLP pipeline");

        Properties props = new Properties();
        props.setProperty("annotators", stanfordAnnotatorsString);

        StanfordCoreNLP pipeline = new StanfordCoreNLP(props);

        edu.stanford.nlp.pipeline.Annotation docAnnotation = new edu.stanford.nlp.pipeline.Annotation(((Document) textUnit).getText());
        pipeline.annotate(docAnnotation);

        List<CoreMap> sentences = docAnnotation.get(SentencesAnnotation.class);

        for (CoreMap stanfordSentence : sentences)
        {
            SentenceAnnotation sentence = new SentenceAnnotation();
            sentence.setText(stanfordSentence.get(TextAnnotation.class));
            sentence.setStartPosition(stanfordSentence.get(CharacterOffsetBeginAnnotation.class));

            for (CoreLabel stanfordToken : stanfordSentence.get(TokensAnnotation.class))
            {
                TokenAnnotation token = new TokenAnnotation(stanfordToken.get(TextAnnotation.class));
                token.setStartPosition(stanfordToken.beginPosition());
                token.setSentenceIndex(stanfordToken.sentIndex());

                if (stanfordAnnotators.contains("lemma"))
                {
                    token.setLemma(stanfordToken.lemma());
                }

                if (stanfordAnnotators.contains("pos"))
                {
                    PartOfSpeechAnnotation posAnnotation = new PartOfSpeechAnnotation();
                    posAnnotation.setTag(stanfordToken.get(edu.stanford.nlp.ling.CoreAnnotations.PartOfSpeechAnnotation.class));
                    token.addChildAnnotation(posAnnotation, AnnotationType.PartOfSpeechAnnotation);
                }

                if (stanfordAnnotators.contains("ner"))
                {
                    NamedEntityTokenAnnotation neta = new NamedEntityTokenAnnotation(stanfordToken.get(NamedEntityTagAnnotation.class));
                    token.addChildAnnotation(neta, AnnotationType.NamedEntityTokenAnnotation);
                }

                sentence.addChildAnnotation(token, AnnotationType.TokenAnnotation);
            }

            // linking contiguous token-level NE labels into whole named-entity annotations
            if (stanfordAnnotators.contains("ner"))
            {
                List<NamedEntityAnnotation> nes = new ArrayList<NamedEntityAnnotation>();
                NamedEntityAnnotation ne = null;
                for (int i = 0; i < sentence.getTokens().size(); i++)
                {
                    String neLabel = sentence.getTokens().get(i).getNamedEntityLabel().getNamedEntityLabel();
                    String neLabelPrevious = i > 0 ? sentence.getTokens().get(i - 1).getNamedEntityLabel().getNamedEntityLabel() : "O";

                    if (neLabel.compareTo("O") == 0)
                    {
                        if (ne != null)
                        {
                            nes.add(ne);
                            ne = null;
                        }
                    }
                    else if (neLabel.compareTo(neLabelPrevious) != 0)
                    {
                        // close the open entity first, so adjacent entities of different types are both kept
                        if (ne != null) nes.add(ne);

                        NamedEntityType type = Arrays.stream(NamedEntityType.values()).filter(e -> e.name().equalsIgnoreCase(neLabel)).findAny().orElse(null);
                        if (type == null)
                        {
                            throw new UnsupportedOperationException("Unknown named entity type!");
                        }

                        ne = new NamedEntityAnnotation(type);
                        ne.setStartPosition(sentence.getTokens().get(i).getStartPosition());
                        ne.addChildAnnotation(sentence.getTokens().get(i), AnnotationType.TokenAnnotation);
                    }
                    else
                    {
                        ne.addChildAnnotation(sentence.getTokens().get(i), AnnotationType.TokenAnnotation);
                    }
                }
                if (ne != null) nes.add(ne);

                nes.forEach(n -> textUnit.addChildAnnotation(n, AnnotationType.NamedEntityAnnotation));
            }

            textUnit.addChildAnnotation(sentence, AnnotationType.SentenceAnnotation);
        }

        // coreference crosses sentence borders
        if (stanfordAnnotators.contains("dcoref"))
        {
            // TODO: coref annotations
        }
    }

    @Override
    public List<Annotation> annotate(String text)
    {
        Document document = new Document(text);
        annotate(document);

        return new ArrayList<Annotation>(Arrays.asList(document));
    }
}
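A minimal usage sketch for the annotator above (the annotator names are the standard CoreNLP ones the class checks for; the snippet itself is illustrative and not part of the commit):

    StanfordAnnotator annotator = new StanfordAnnotator();
    annotator.setStanfordAnnotators(Arrays.asList("tokenize", "ssplit", "pos", "lemma", "ner"));
    Document doc = new Document("Angela Merkel visited Paris on Monday.");
    annotator.annotate(doc); // sentence, token, POS, lemma, and NE annotations are attached to doc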
source/src/edu/uma/nlp/graphseg/preprocessing/TokenAnnotation.java
ADDED
@@ -0,0 +1,104 @@
package edu.uma.nlp.graphseg.preprocessing;

public class TokenAnnotation extends Annotation {

    // Fields & properties

    private String text;
    public String getText() {
        return text;
    }
    public void setText(String text) {
        this.text = text;
    }

    private String lemma;
    public String getLemma() {
        return lemma;
    }
    public void setLemma(String lemma) {
        this.lemma = lemma;
    }

    private int startPosition;
    public int getStartPosition() {
        return startPosition;
    }
    public void setStartPosition(int startPosition) {
        this.startPosition = startPosition;
    }

    public int getEndPosition() {
        return text != null ? startPosition + text.length() - 1 : startPosition;
    }

    private int startPositionSentence;
    public int getStartPositionSentence() {
        return startPositionSentence;
    }
    public void setStartPositionSentence(int startPositionSentence) {
        this.startPositionSentence = startPositionSentence;
    }

    public int getEndPositionSentence() {
        return text != null ? startPositionSentence + text.length() - 1 : startPositionSentence;
    }

    private int sentenceIndex;
    public int getSentenceIndex() {
        return sentenceIndex;
    }
    public void setSentenceIndex(int sentenceIndex) {
        this.sentenceIndex = sentenceIndex;
    }

    // Lazily loaded properties

    private PartOfSpeechAnnotation partOfSpeech;
    public PartOfSpeechAnnotation getPartOfSpeech()
    {
        if (partOfSpeech == null) partOfSpeech = loadPartOfSpeech();
        return partOfSpeech;
    }

    private NamedEntityTokenAnnotation namedEntityLabel;
    public NamedEntityTokenAnnotation getNamedEntityLabel()
    {
        if (namedEntityLabel == null) namedEntityLabel = loadTokenNELabel();
        return namedEntityLabel;
    }

    private PartOfSpeechAnnotation loadPartOfSpeech()
    {
        if (!childAnnotations.containsKey(AnnotationType.PartOfSpeechAnnotation)) this.addChildAnnotation(new PartOfSpeechAnnotation(), AnnotationType.PartOfSpeechAnnotation);
        return ((PartOfSpeechAnnotation) (getChildAnnotations(AnnotationType.PartOfSpeechAnnotation).get(0)));
    }

    private NamedEntityTokenAnnotation loadTokenNELabel()
    {
        if (!childAnnotations.containsKey(AnnotationType.NamedEntityTokenAnnotation)) return null;
        else return ((NamedEntityTokenAnnotation) (childAnnotations.get(AnnotationType.NamedEntityTokenAnnotation).get(0)));
    }

    public TokenAnnotation(String text, int startPosition, int startPositionSentence, int sentenceIndex)
    {
        this.text = text;
        this.startPosition = startPosition;
        this.startPositionSentence = startPositionSentence;
        this.sentenceIndex = sentenceIndex;
    }

    public TokenAnnotation(String text)
    {
        this.text = text;
    }
}
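Note that the end positions above are inclusive (start + length - 1). A tiny illustrative check:

    TokenAnnotation t = new TokenAnnotation("text", 10, 0, 0);
    int end = t.getEndPosition(); // 13: offset of the last character, not one past it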
source/src/edu/uma/nlp/graphseg/semantics/InformationContent.java
ADDED
@@ -0,0 +1,77 @@
package edu.uma.nlp.graphseg.semantics;

import java.io.IOException;
import java.io.InputStream;
import java.io.UnsupportedEncodingException;
import java.util.HashMap;
import java.util.List;

import edu.uma.nlp.graphseg.utils.IOHelper;

public class InformationContent {
    private HashMap<String, Integer> frequencies = new HashMap<String, Integer>();
    private double sumFrequencies = 0;
    private double minFreq = 1;
    private double divideFactor = 1;

    public InformationContent(String path, double divideFactor)
    {
        this.divideFactor = divideFactor;

        frequencies = IOHelper.loadCounts(path);
        sumFrequencies = frequencies.values().stream().mapToDouble(x -> ((double) x) / divideFactor).sum();
        minFreq = frequencies.values().stream().mapToDouble(x -> ((double) x) / divideFactor).min().getAsDouble();
        if (minFreq == 0) minFreq = 1;
    }

    public InformationContent(InputStream stream, double divideFactor) throws UnsupportedEncodingException, IOException
    {
        this.divideFactor = divideFactor;

        frequencies = IOHelper.loadCounts(stream);
        sumFrequencies = frequencies.values().stream().mapToDouble(x -> ((double) x) / divideFactor).sum();
        minFreq = frequencies.values().stream().mapToDouble(x -> ((double) x) / divideFactor).min().getAsDouble();
        if (minFreq == 0) minFreq = 1;
    }

    public InformationContent(HashMap<String, Integer> frequenciesDictionary, double divideFactor)
    {
        this.divideFactor = divideFactor;

        frequencies = frequenciesDictionary;
        sumFrequencies = frequencies.values().stream().mapToDouble(x -> ((double) x) / divideFactor).sum();
        minFreq = frequencies.values().stream().mapToDouble(x -> ((double) x) / divideFactor).min().getAsDouble();
        if (minFreq == 0) minFreq = 1; // guard added for consistency with the other constructors
    }

    // IC(w) = -log(((freq(w) + minFreq) / divideFactor) / sumFrequencies); unseen words get the maximum IC
    public double getInformationContent(String word)
    {
        if (frequencies.containsKey(word.toLowerCase())) return (-1) * Math.log(((((double) frequencies.get(word.toLowerCase())) + minFreq) / divideFactor) / sumFrequencies);
        else return (-1) * Math.log((minFreq / divideFactor) / sumFrequencies);
    }

    public double getRelativeInformationContent(String word)
    {
        double maxInfCont = (-1) * Math.log((minFreq / divideFactor) / sumFrequencies);
        double infCont = frequencies.containsKey(word.toLowerCase()) ? (-1) * Math.log(((((double) frequencies.get(word.toLowerCase())) + minFreq) / divideFactor) / sumFrequencies) : maxInfCont;

        return infCont / maxInfCont;
    }

    public double getLogRelativeInformationContent(String word)
    {
        double maxInfCont = (-1) * Math.log((minFreq / divideFactor) / sumFrequencies);
        double infCont = frequencies.containsKey(word.toLowerCase()) ? (-1) * Math.log(((((double) frequencies.get(word.toLowerCase())) + minFreq) / divideFactor) / sumFrequencies) : maxInfCont;

        return Math.log(infCont) / Math.log(maxInfCont);
    }

    public double getInformationContent(List<String> phrase)
    {
        double ic = 1;
        for (String w : phrase)
        {
            ic *= getInformationContent(w);
        }
        return ic;
    }
}
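The information content above is a smoothed negative log relative frequency: frequent words score near zero, rare or unseen words score high. A minimal sketch of the same computation on toy counts (hypothetical values, not from the shipped frequency files):

    HashMap<String, Integer> counts = new HashMap<String, Integer>();
    counts.put("the", 1000);        // frequent word
    counts.put("segmentation", 2);  // rare word
    InformationContent ic = new InformationContent(counts, 1.0);
    System.out.println(ic.getInformationContent("the"));          // 0.0: -ln((1000+2)/1002)
    System.out.println(ic.getInformationContent("segmentation")); // ~5.52: -ln((2+2)/1002)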
source/src/edu/uma/nlp/graphseg/semantics/SemanticSimilarity.java
ADDED
@@ -0,0 +1,252 @@
package edu.uma.nlp.graphseg.semantics;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.stream.Collectors;

import org.javatuples.Triplet;

import edu.uma.nlp.graphseg.preprocessing.TokenAnnotation;
import edu.uma.nlp.graphseg.utils.VectorOperations;

public class SemanticSimilarity {

    private static List<String> stopwords;
    public static void setStopwords(List<String> stwrds)
    {
        stopwords = stwrds;
    }

    public static double greedyAlignmentOverlapFScore(List<TokenAnnotation> firstPhrase, List<TokenAnnotation> secondPhrase, WordVectorSpace vectorSpace, InformationContent informationContent, Boolean contentWordsOnly)
    {
        return greedyAlignmentOverlap(firstPhrase, secondPhrase, vectorSpace, informationContent, contentWordsOnly).getValue2();
    }

    public static double greedyAlignmentOverlapPrecision(List<TokenAnnotation> firstPhrase, List<TokenAnnotation> secondPhrase, WordVectorSpace vectorSpace, InformationContent informationContent, Boolean contentWordsOnly)
    {
        return greedyAlignmentOverlap(firstPhrase, secondPhrase, vectorSpace, informationContent, contentWordsOnly).getValue0();
    }

    public static double greedyAlignmentOverlapRecall(List<TokenAnnotation> firstPhrase, List<TokenAnnotation> secondPhrase, WordVectorSpace vectorSpace, InformationContent informationContent, Boolean contentWordsOnly)
    {
        return greedyAlignmentOverlap(firstPhrase, secondPhrase, vectorSpace, informationContent, contentWordsOnly).getValue1();
    }

    // returns (precision, recall, F1) of the greedy word alignment between the two phrases
    public static Triplet<Double, Double, Double> greedyAlignmentOverlap(List<TokenAnnotation> firstPhrase, List<TokenAnnotation> secondPhrase, WordVectorSpace vectorSpace, InformationContent informationContent, Boolean contentWordsOnly)
    {
        List<TokenAnnotation> firstPhraseCopy = new ArrayList<TokenAnnotation>();
        List<TokenAnnotation> secondPhraseCopy = new ArrayList<TokenAnnotation>();
        if (contentWordsOnly)
        {
            firstPhraseCopy.addAll(firstPhrase.stream().filter(x -> x.getPartOfSpeech().isContent()).collect(Collectors.toList()));
            secondPhraseCopy.addAll(secondPhrase.stream().filter(x -> x.getPartOfSpeech().isContent()).collect(Collectors.toList()));
        }
        else
        {
            firstPhraseCopy.addAll(firstPhrase);
            secondPhraseCopy.addAll(secondPhrase);
        }

        if (stopwords != null && stopwords.size() > 0)
        {
            firstPhraseCopy = firstPhraseCopy.stream().filter(t -> !stopwords.contains(t.getLemma().toLowerCase()) && !stopwords.contains(t.getText().toLowerCase())).collect(Collectors.toList());
            secondPhraseCopy = secondPhraseCopy.stream().filter(t -> !stopwords.contains(t.getLemma().toLowerCase()) && !stopwords.contains(t.getText().toLowerCase())).collect(Collectors.toList());
        }

        // greedily pair the most similar remaining tokens until one phrase is exhausted
        List<Double> pairSimilarities = new ArrayList<Double>();
        while (firstPhraseCopy.size() > 0 && secondPhraseCopy.size() > 0)
        {
            double maxSim = -1;
            TokenAnnotation firstToken = null;
            TokenAnnotation secondToken = null;
            for (TokenAnnotation nf : firstPhraseCopy)
            {
                for (TokenAnnotation ns : secondPhraseCopy)
                {
                    double sim = vectorSpace.similarity(nf.getText().toLowerCase(), ns.getText().toLowerCase());
                    if (sim < 0) sim = 0;

                    if (sim > maxSim)
                    {
                        firstToken = nf;
                        secondToken = ns;
                        maxSim = sim;
                    }
                }
            }

            if (informationContent != null)
            {
                pairSimilarities.add(maxSim * Math.max(informationContent.getInformationContent(firstToken.getText().toLowerCase()), informationContent.getInformationContent(secondToken.getText().toLowerCase())));
            }
            else pairSimilarities.add(maxSim);

            firstPhraseCopy.remove(firstToken);
            secondPhraseCopy.remove(secondToken);
        }

        double precision = 0;
        double recall = 0;
        double overlap = pairSimilarities.stream().mapToDouble(s -> s).sum();

        if (informationContent != null)
        {
            double infContentFirst = contentWordsOnly ?
                firstPhrase.stream().filter(x -> x.getPartOfSpeech().isContent()).mapToDouble(t -> informationContent.getInformationContent(t.getText().toLowerCase())).sum() :
                firstPhrase.stream().mapToDouble(t -> informationContent.getInformationContent(t.getText().toLowerCase())).sum();

            double infContentSecond = contentWordsOnly ?
                secondPhrase.stream().filter(x -> x.getPartOfSpeech().isContent()).mapToDouble(t -> informationContent.getInformationContent(t.getText().toLowerCase())).sum() :
                secondPhrase.stream().mapToDouble(t -> informationContent.getInformationContent(t.getText().toLowerCase())).sum();

            precision = overlap / infContentFirst;
            recall = overlap / infContentSecond;
        }
        else
        {
            precision = overlap / firstPhrase.size();
            recall = overlap / secondPhrase.size();
        }

        double fScore = 0;
        if (precision == 0 && recall == 0) fScore = 0;
        else fScore = (2 * precision * recall) / (precision + recall);
        if (Double.isNaN(fScore)) fScore = 0;

        return new Triplet<Double, Double, Double>(precision, recall, fScore);
    }

    public static double embeddingSumSimilarity(List<TokenAnnotation> first, List<TokenAnnotation> second, WordVectorSpace vectorSpace, int embeddingLength, Boolean content, List<InformationContent> infContents)
    {
        double[] embeddingFirst = new double[embeddingLength];
        double[] embeddingSecond = new double[embeddingLength];

        if (content)
        {
            first = first.stream().filter(x -> x.getPartOfSpeech().isContent()).collect(Collectors.toList());
            second = second.stream().filter(x -> x.getPartOfSpeech().isContent()).collect(Collectors.toList());
        }

        addWeightedEmbeddings(first, embeddingFirst, vectorSpace, infContents);
        addWeightedEmbeddings(second, embeddingSecond, vectorSpace, infContents);

        double res;
        try {
            res = VectorOperations.cosine(embeddingFirst, embeddingSecond);
        } catch (Exception e) {
            e.printStackTrace();
            res = 0;
        }
        if (Double.isNaN(res)) res = 0;

        return res;
    }

    // adds the IC-weighted embedding of each token to the aggregate vector
    private static void addWeightedEmbeddings(List<TokenAnnotation> tokens, double[] aggregate, WordVectorSpace vectorSpace, List<InformationContent> infContents)
    {
        tokens.forEach(x ->
        {
            double[] wordEmbedding = vectorSpace.getEmbedding(x.getText().trim());
            if (wordEmbedding == null)
            {
                wordEmbedding = vectorSpace.getEmbedding(x.getText().trim().toLowerCase());
            }
            if (wordEmbedding != null)
            {
                double ic = 1;
                for (InformationContent inco : infContents)
                {
                    ic *= inco.getInformationContent(x.getText().trim().toLowerCase());
                }
                // copy before scaling so the shared embedding stored in the vector space is not mutated
                double[] weighted = Arrays.copyOf(wordEmbedding, wordEmbedding.length);
                VectorOperations.multiply(weighted, ic);
                VectorOperations.addVector(aggregate, weighted);
            }
        });
    }

    public static double averagePhraseGreedyAlignmentOverlap(List<List<TokenAnnotation>> firstPhrases, List<List<TokenAnnotation>> secondPhrases, WordVectorSpace vectorSpace, InformationContent informationContent, Boolean contentWordsOnly)
    {
        double sum = 0;
        double counter = 0;

        for (List<TokenAnnotation> fp : firstPhrases) {
            for (List<TokenAnnotation> sp : secondPhrases) {
                sum += greedyAlignmentOverlapFScore(fp, sp, vectorSpace, informationContent, contentWordsOnly);
                counter++;
            }
        }

        double score = sum / counter;
        if (Double.isNaN(score) || Double.isInfinite(score)) return 0;
        else return score;
    }

    public static double maxPhraseGreedyAlignmentOverlap(List<List<TokenAnnotation>> firstPhrases, List<List<TokenAnnotation>> secondPhrases, WordVectorSpace vectorSpace, InformationContent informationContent, Boolean contentWordsOnly)
    {
        double maxSim = 0;

        for (List<TokenAnnotation> fp : firstPhrases) {
            for (List<TokenAnnotation> sp : secondPhrases) {
                double sim = greedyAlignmentOverlapFScore(fp, sp, vectorSpace, informationContent, contentWordsOnly);
                if (sim > maxSim)
                {
                    maxSim = sim;
                }
            }
        }
        return maxSim;
    }

    public static int numSufficientlySimilarPhrasesGreedyAlignmentOverlap(List<List<TokenAnnotation>> firstPhrases, List<List<TokenAnnotation>> secondPhrases, double threshold, WordVectorSpace vectorSpace, InformationContent informationContent, Boolean contentWordsOnly)
    {
        int counter = 0;

        for (List<TokenAnnotation> fp : firstPhrases) {
            for (List<TokenAnnotation> sp : secondPhrases) {
                double sim = greedyAlignmentOverlapFScore(fp, sp, vectorSpace, informationContent, contentWordsOnly);
                if (sim >= threshold)
                {
                    counter++;
                }
            }
        }
        return counter;
    }

    public static HashMap<String, Double> allToAllSimilarity(WordVectorSpace vectorSpace, List<String> vocabulary)
    {
        HashMap<String, Double> similarities = new HashMap<String, Double>();
        for (int i = 0; i < vocabulary.size() - 1; i++)
        {
            if (i % 100 == 0) System.out.println("Outer loop: " + (i + 1) + "/" + (vocabulary.size() - 1));
            for (int j = i + 1; j < vocabulary.size(); j++)
            {
                double sim = vectorSpace.similarity(vocabulary.get(i), vocabulary.get(j));
                if (sim >= -1) // similarity() returns -2 for out-of-vocabulary words
                {
                    similarities.put(vocabulary.get(i).compareTo(vocabulary.get(j)) < 0 ? vocabulary.get(i) + "<=>" + vocabulary.get(j) : vocabulary.get(j) + "<=>" + vocabulary.get(i), sim);
                }
            }
        }
        return similarities;
    }
}
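A minimal sketch of invoking the greedy alignment overlap above (firstSentence and secondSentence are assumed to be SentenceAnnotation instances produced by StanfordAnnotator; the snippet is illustrative):

    List<TokenAnnotation> s1 = firstSentence.getTokens();
    List<TokenAnnotation> s2 = secondSentence.getTokens();
    double fScore = SemanticSimilarity.greedyAlignmentOverlapFScore(
            s1, s2, MemoryStorage.getWordVectorSpace(), MemoryStorage.getInformationContent(), true);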
source/src/edu/uma/nlp/graphseg/semantics/WordVectorSpace.java
ADDED
@@ -0,0 +1,151 @@
package edu.uma.nlp.graphseg.semantics;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.FileReader;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;

import org.javatuples.Pair;

import edu.uma.nlp.graphseg.utils.VectorOperations;

public class WordVectorSpace {

    private HashMap<String, double[]> embeddings;
    private int dimension;

    public int getDimension() {
        return dimension;
    }

    // loads textual embeddings ("word v1 v2 ... vn" per line); if filters is non-null, only its words are kept
    public void load(String path, HashMap<String, Integer> filters) throws FileNotFoundException, IOException
    {
        embeddings = new HashMap<String, double[]>();

        try (BufferedReader br = new BufferedReader(new FileReader(path))) {
            String line;
            int counter = 0;
            while ((line = br.readLine()) != null) {
                try
                {
                    String[] split = line.trim().split("\\s+");

                    if (filters == null || filters.containsKey(split[0].toLowerCase()))
                    {
                        dimension = split.length - 1;

                        if (!embeddings.containsKey(split[0])) embeddings.put(split[0], new double[split.length - 1]);
                        for (int i = 1; i < split.length; i++)
                        {
                            embeddings.get(split[0])[i - 1] = Double.parseDouble(split[i]);
                        }
                    }
                    counter++;
                    if (counter % 1000 == 0)
                    {
                        System.out.println("Loading vectors... " + counter);
                    }
                }
                catch (Exception e)
                {
                    System.out.println("Error processing line!");
                }
            }
        }
    }

    public void save(String path) throws Exception
    {
        BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(new File(path))));

        embeddings.forEach((key, value) -> {
            try {
                writer.write(key + " ");
                for (int i = 0; i < value.length; i++)
                {
                    writer.write(String.valueOf(value[i]) + " ");
                }
                writer.newLine();
            } catch (IOException e) {
                e.printStackTrace();
            }
        });
        writer.close();
    }

    // cosine similarity of two words; returns -2 if either word is missing from the vector space
    public double similarity(String word1, String word2)
    {
        if (word1.compareTo(word2) == 0) return 1;
        if (embeddings.containsKey(word1) && embeddings.containsKey(word2))
        {
            try {
                return VectorOperations.cosine(embeddings.get(word1), embeddings.get(word2));
            } catch (Exception e) {
                return -2;
            }
        }
        else return -2;
    }

    public double[] getEmbedding(String word)
    {
        if (embeddings.containsKey(word)) return embeddings.get(word);
        else return null;
    }

    public List<Pair<String, Double>> getMostSimilar(String word, int numMostSimilar)
    {
        List<Pair<String, Double>> mostSimilar = new ArrayList<Pair<String, Double>>();
        if (embeddings.containsKey(word))
        {
            embeddings.forEach((key, val) -> {
                if (!key.trim().equals(word)) // string equality, not the reference comparison "!="
                {
                    double sim;
                    try {
                        sim = VectorOperations.cosine(embeddings.get(word), val);
                    } catch (Exception e) {
                        sim = -2;
                    }
                    if (mostSimilar.size() < numMostSimilar)
                    {
                        mostSimilar.add(new Pair<String, Double>(key, sim));
                        mostSimilar.sort((x, y) -> x.getValue1() > y.getValue1() ? -1 : (x.getValue1() < y.getValue1() ? 1 : 0));
                    }
                    else if (sim > mostSimilar.get(mostSimilar.size() - 1).getValue1())
                    {
                        mostSimilar.set(mostSimilar.size() - 1, new Pair<String, Double>(key, sim));
                        mostSimilar.sort((x, y) -> x.getValue1() > y.getValue1() ? -1 : (x.getValue1() < y.getValue1() ? 1 : 0));
                    }
                }
            });

            return mostSimilar;
        }
        else return null;
    }
}
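A minimal load-and-query sketch (the path is a placeholder; the expected file format, one word followed by its vector components per line, follows the parser above):

    WordVectorSpace space = new WordVectorSpace();
    space.load("/path/to/word-vectors.txt", null); // null filter: keep every word
    double sim = space.similarity("government", "parliament");
    if (sim >= -1) System.out.println("cosine: " + sim); // -2 would signal an out-of-vocabulary word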
source/src/edu/uma/nlp/graphseg/utils/ApplicationConfiguration.java
ADDED
@@ -0,0 +1,49 @@
package edu.uma.nlp.graphseg.utils;

import java.io.IOException;
import java.io.InputStream;
import java.util.Properties;

public class ApplicationConfiguration {

    public static ApplicationConfiguration config = new ApplicationConfiguration();

    private Properties prop;

    public ApplicationConfiguration()
    {
        prop = new Properties();
        InputStream inStream = getClass().getClassLoader().getResourceAsStream("config.properties");

        if (inStream != null)
        {
            try
            {
                prop.load(inStream);
            }
            catch (IOException e) {
                e.printStackTrace();
            }
            finally
            {
                try
                {
                    inStream.close();
                }
                catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }
    }

    public String getValue(String key)
    {
        if (prop != null)
        {
            return prop.getProperty(key);
        }
        else return null;
    }
}
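Lookups go through the shared config instance backed by the bundled config.properties; the key below is hypothetical and only illustrates the call:

    String value = ApplicationConfiguration.config.getValue("some.key"); // null if the key is absent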
source/src/edu/uma/nlp/graphseg/utils/IOHelper.java
ADDED
@@ -0,0 +1,385 @@
package edu.uma.nlp.graphseg.utils;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileOutputStream;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.UnsupportedEncodingException;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;

import org.apache.commons.io.FileUtils;
import org.apache.commons.lang3.StringUtils;

public class IOHelper {
    public static List<String> getAllLines(String path)
    {
        try {
            return FileUtils.readLines(new File(path));
        } catch (IOException e) {
            System.out.println("File not found or error reading the file: " + path);
            System.out.println(e.getMessage());
            return null;
        }
    }

    public static List<String> getAllLinesWithoutEmpty(String path)
    {
        try {
            List<String> allLines = Files.readAllLines(Paths.get(path));
            List<String> noEmpty = new ArrayList<String>();

            for (int i = 0; i < allLines.size(); i++)
            {
                if (!StringUtils.isEmpty(allLines.get(i).trim()))
                {
                    noEmpty.add(allLines.get(i));
                }
            }

            return noEmpty;

        } catch (IOException e) {
            System.out.println("File not found or error reading the file: " + path);
            return null;
        }
    }

    public static void writeLines(List<String> lines, String path)
    {
        StringBuilder builder = new StringBuilder();
        for (int i = 0; i < lines.size(); i++)
        {
            builder.append(lines.get(i) + "\n");
        }

        try {
            FileUtils.writeStringToFile(new File(path), builder.toString());
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    public static void writeCounts(Map<String, Integer> dictionary, String path, Boolean ordered)
    {
        writeCounts(dictionary.entrySet().stream().collect(Collectors.toList()), path, ordered);
    }

    public static void writeCounts(List<Map.Entry<String, Integer>> entries, String path, Boolean ordered)
    {
        try {
            BufferedWriter bw = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(new File(path)), "UTF-8"));

            // descending by count
            if (ordered) entries.sort((i1, i2) -> i1.getValue() > i2.getValue() ? -1 : (i2.getValue() > i1.getValue() ? 1 : 0));

            for (int i = 0; i < entries.size(); i++)
            {
                bw.write(entries.get(i).getKey() + " " + entries.get(i).getValue() + "\n");
            }

            bw.close();

        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    public static void writeScores(Map<String, Double> dictionary, String path, Boolean orderedDescending, Map<String, Integer> additionalData, Boolean mweUnderscore)
    {
        writeScores(dictionary.entrySet().stream().collect(Collectors.toList()), path, orderedDescending, additionalData, mweUnderscore);
    }

    public static void writeScores(List<Map.Entry<String, Double>> entries, String path, Boolean orderedDescending, Map<String, Integer> additionalData, Boolean mweUnderscore)
    {
        try {
            BufferedWriter bw = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(new File(path)), "UTF-8"));

            entries.sort((i1, i2) -> i1.getValue() > i2.getValue() ? (orderedDescending ? -1 : 1) : (i2.getValue() > i1.getValue() ? (orderedDescending ? 1 : -1) : 0));

            for (int i = 0; i < entries.size(); i++)
            {
                String line = "";
                if (mweUnderscore)
                {
                    // join multi-word expressions with underscores
                    line = String.join("_", entries.get(i).getKey().split("\\s+")) + " ";
                }
                else line = entries.get(i).getKey() + " ";

                line += String.valueOf(entries.get(i).getValue());

                if (additionalData != null)
                {
                    line += " " + String.valueOf(additionalData.get(entries.get(i).getKey()));
                }
                bw.write(line.trim() + "\n");
            }

            bw.close();

        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    public static HashMap<String, Integer> loadCounts(String path)
    {
        HashMap<String, Integer> dict = new HashMap<String, Integer>();
        List<String> lines = getAllLines(path);

        for (int i = 0; i < lines.size(); i++)
        {
            String[] split = lines.get(i).split("\\s+");
            if (!dict.containsKey(split[0]))
            {
                dict.put(split[0], Integer.parseInt(split[1]));
            }
        }
        return dict;
    }

    public static HashMap<String, Integer> loadCounts(InputStream stream) throws UnsupportedEncodingException, IOException
    {
        HashMap<String, Integer> dict = new HashMap<String, Integer>();
        try (BufferedReader br = new BufferedReader(new InputStreamReader(stream, "UTF-8"))) {
            for (String line; (line = br.readLine()) != null; ) {
                if (StringUtils.isNotEmpty(line.trim()))
                {
                    String[] split = line.split("\\s+");
                    if (!dict.containsKey(split[0]))
                    {
                        dict.put(split[0], Integer.parseInt(split[1]));
                    }
                }
            }
        }
        return dict;
    }

    public static HashMap<String, Double> loadScores(String path)
    {
        HashMap<String, Double> dict = new HashMap<String, Double>();
        List<String> lines = getAllLines(path);

        for (int i = 0; i < lines.size(); i++)
        {
            String[] split = lines.get(i).split("\\s+");
            if (!dict.containsKey(split[0]))
            {
                dict.put(split[0], Double.parseDouble(split[1]));
            }
        }
        return dict;
    }

    public static void peekTopLines(String inputPath, String outputPath, int numLines)
    {
        List<String> lines = new ArrayList<String>();
        try (BufferedReader br = new BufferedReader(new FileReader(inputPath))) {
            for (int i = 0; i < numLines; i++) {
                lines.add(br.readLine());
            }
            IOHelper.writeLines(lines, outputPath);
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    public static Map<String, Double> loadScoresLineByLine(String path)
    {
        Map<String, Double> dict = Collections.synchronizedMap(new HashMap<String, Double>());

        try (BufferedReader br = new BufferedReader(new FileReader(path))) {
            for (String line; (line = br.readLine()) != null; ) {
                if (StringUtils.isNotEmpty(line.trim()))
                {
                    String[] split = line.split("\\s+");
                    if (!dict.containsKey(split[0]))
                    {
                        dict.put(split[0], Double.parseDouble(split[1]));
                    }
                }
            }
        } catch (IOException e) {
            e.printStackTrace();
        }

        return dict;
    }

    public static Map<String, Integer> loadRanks(String path)
    {
        Map<String, Integer> dict = Collections.synchronizedMap(new HashMap<String, Integer>());

        try (BufferedReader br = new BufferedReader(new FileReader(path))) {
            int counter = 0;
            for (String line; (line = br.readLine()) != null; ) {
                if (StringUtils.isNotEmpty(line.trim()))
                {
                    counter++;
                    String[] split = line.split("\\s+");
                    if (!dict.containsKey(split[0]))
                    {
                        dict.put(split[0], counter);
                    }
                }
            }
        } catch (IOException e) {
            e.printStackTrace();
        }

        return dict;
    }

    public static Map<String, Double> loadScoresLineByLine(String path, double threshold, Boolean sorted)
    {
        Map<String, Double> dict = Collections.synchronizedMap(new HashMap<String, Double>());

        try (BufferedReader br = new BufferedReader(new FileReader(path))) {
            for (String line; (line = br.readLine()) != null; ) {
                if (StringUtils.isNotEmpty(line.trim()))
                {
                    String[] split = line.split("\\s+");
                    if (!dict.containsKey(split[0]))
                    {
                        Double score = Double.parseDouble(split[1]);
                        if (score >= threshold) dict.put(split[0], score);
                        // for a file sorted descending by score, everything below the threshold can be skipped
                        else if (sorted)
                        {
                            return dict;
                        }
                    }
                }
            }
        } catch (IOException e) {
            e.printStackTrace();
        }

        return dict;
    }

    public static Map<String, Double> loadScoresLineByLine(String path, int topN)
    {
        Map<String, Double> dict = Collections.synchronizedMap(new HashMap<String, Double>());

        try (BufferedReader br = new BufferedReader(new FileReader(path))) {
            for (int i = 0; i < topN; i++) {
                String line = br.readLine();
                if (line == null) break; // the file has fewer than topN lines
                if (StringUtils.isNotEmpty(line.trim()))
                {
                    String[] split = line.split("\\s+");
                    if (!dict.containsKey(split[0]))
                    {
                        dict.put(split[0], Double.parseDouble(split[1]));
                    }
                }
            }
        } catch (IOException e) {
            e.printStackTrace();
        }

        return dict;
    }

    public static HashMap<String, String> loadMappings(String path)
    {
        HashMap<String, String> dict = new HashMap<String, String>();
        List<String> lines = getAllLines(path);

        for (int i = 0; i < lines.size(); i++)
        {
            String[] split = lines.get(i).split("\\s+");
            if (!dict.containsKey(split[0]))
            {
                dict.put(split[0], split[1]);
            }
        }
        return dict;
    }

    public static HashMap<String, List<String>> loadMultiMappings(String path)
    {
        HashMap<String, List<String>> dict = new HashMap<String, List<String>>();
        List<String> lines = getAllLines(path);

        for (int i = 0; i < lines.size(); i++)
        {
            String[] split = lines.get(i).split("\\s+");
            if (!dict.containsKey(split[0]))
            {
                dict.put(split[0], new ArrayList<String>());
            }
            dict.get(split[0]).add(split[1]);
        }
        return dict;
    }

    public static void writeStringToFile(String content, String path) throws IOException
    {
        FileUtils.writeStringToFile(new File(path), content, "UTF-8");
    }
}
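A small usage sketch for the count helpers above (paths are placeholders; each line of a counts file is "&lt;word&gt; &lt;count&gt;", as the parser expects):

    HashMap<String, Integer> counts = IOHelper.loadCounts("/path/to/frequencies.txt");
    IOHelper.writeCounts(counts, "/path/to/frequencies-sorted.txt", true); // true: sort descending by count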
source/src/edu/uma/nlp/graphseg/utils/MemoryStorage.java
ADDED
@@ -0,0 +1,26 @@
package edu.uma.nlp.graphseg.utils;

import edu.uma.nlp.graphseg.semantics.InformationContent;
import edu.uma.nlp.graphseg.semantics.WordVectorSpace;

// application-wide holder for the loaded word vector space and information content model
public class MemoryStorage {

    private static WordVectorSpace wordVectorSpace;

    public static WordVectorSpace getWordVectorSpace() {
        return wordVectorSpace;
    }
    public static void setWordVectorSpace(WordVectorSpace wordVectorSpace) {
        MemoryStorage.wordVectorSpace = wordVectorSpace;
    }

    private static InformationContent informationContent;

    public static InformationContent getInformationContent() {
        return informationContent;
    }
    public static void setInformationContent(InformationContent informationContent) {
        MemoryStorage.informationContent = informationContent;
    }
}
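Typical use is to load the heavy resources once and share them through this holder (illustrative, continuing the WordVectorSpace sketch above):

    MemoryStorage.setWordVectorSpace(space);
    WordVectorSpace shared = MemoryStorage.getWordVectorSpace(); // same instance everywhere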
source/src/edu/uma/nlp/graphseg/utils/VectorOperations.java
ADDED
@@ -0,0 +1,45 @@
package edu.uma.nlp.graphseg.utils;

public class VectorOperations {
    public static double cosine(double[] vector, double[] otherVector) throws Exception
    {
        if (vector.length != otherVector.length)
        {
            throw new UnsupportedOperationException("Vectors are of different length");
        }

        double dp = 0;
        double sum1 = 0;
        double sum2 = 0;

        for (int i = 0; i < vector.length; i++)
        {
            dp += vector[i] * otherVector[i];
            sum1 += vector[i] * vector[i];
            sum2 += otherVector[i] * otherVector[i];
        }

        return dp / (Math.sqrt(sum1) * Math.sqrt(sum2));
    }

    public static void multiply(double[] vector, double factor)
    {
        for (int i = 0; i < vector.length; i++) vector[i] *= factor;
    }

    public static double[] sumVectors(double[] vector, double[] otherVector)
    {
        if (vector.length != otherVector.length) throw new UnsupportedOperationException("Vectors are of different length");

        double[] result = new double[vector.length];
        // the inputs are left untouched (the original "result[i] = vector[i] += otherVector[i]" mutated the first argument)
        for (int i = 0; i < vector.length; i++) result[i] = vector[i] + otherVector[i];

        return result;
    }

    public static void addVector(double[] vector, double[] otherVector)
    {
        if (vector.length != otherVector.length) throw new UnsupportedOperationException("Vectors are of different length");
        for (int i = 0; i < vector.length; i++) vector[i] += otherVector[i];
    }
}
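A quick sanity check for the cosine above, hand-computable as cos((1,0),(1,1)) = 1/sqrt(2) (cosine declares throws Exception, so the call must sit in a method that propagates or catches it):

    double sim = VectorOperations.cosine(new double[]{1, 0}, new double[]{1, 1}); // ~0.7071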