Update app.R
Browse files
app.R
CHANGED
@@ -1,21 +1,24 @@
|
|
1 |
library(shiny)
|
2 |
library(udpipe)
|
|
|
3 |
|
4 |
# Load the French GSD model (ensure it's downloaded and adjust path if necessary)
|
5 |
model <- udpipe_load_model("french-gsd-ud-2.5-191206.udpipe")
|
6 |
|
7 |
# Define UI for the application
|
8 |
ui <- fluidPage(
|
9 |
-
titlePanel("French Readability Analyzer with UDPipe"),
|
10 |
|
11 |
sidebarLayout(
|
12 |
sidebarPanel(
|
13 |
-
|
|
|
|
|
14 |
actionButton("analyze", "Analyze")
|
15 |
),
|
16 |
|
17 |
mainPanel(
|
18 |
-
h3("Readability Features"),
|
19 |
tableOutput("results")
|
20 |
)
|
21 |
)
|
@@ -31,27 +34,53 @@ server <- function(input, output) {
|
|
31 |
annotated <- udpipe_annotate(model, x = text)
|
32 |
annotated_df <- as.data.frame(annotated)
|
33 |
|
34 |
-
# Calculate readability metrics
|
35 |
-
word_count <- nrow(annotated_df[annotated_df$upos == "NOUN" | annotated_df$upos == "VERB" |
|
36 |
-
annotated_df$upos == "ADJ" | annotated_df$upos == "ADV", ])
|
37 |
|
|
|
|
|
38 |
sentence_count <- length(unique(annotated_df$sentence_id))
|
39 |
|
40 |
# Syllable count - count vowels in each token as an approximation
|
41 |
syllable_count <- sum(sapply(gregexpr("[aeiouyAEIOUY]", annotated_df$token), function(x) max(0, length(x))))
|
42 |
-
|
43 |
-
# Average sentence length (words per sentence)
|
44 |
avg_sentence_length <- ifelse(sentence_count > 0, word_count / sentence_count, 0)
|
45 |
-
|
46 |
-
# Average syllables per word
|
47 |
avg_syllables_per_word <- ifelse(word_count > 0, syllable_count / word_count, 0)
|
48 |
|
49 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
50 |
results <- data.frame(
|
51 |
Metric = c("Word Count", "Sentence Count", "Syllable Count",
|
52 |
-
"Average Sentence Length", "Average Syllables per Word"
|
|
|
|
|
|
|
53 |
Value = c(word_count, sentence_count, syllable_count,
|
54 |
-
round(avg_sentence_length, 2), round(avg_syllables_per_word, 2)
|
|
|
|
|
|
|
55 |
)
|
56 |
|
57 |
return(results)
|
|
|
library(shiny)
library(udpipe)
library(stringr)

# Load the French GSD treebank model for udpipe.
# The .udpipe file must already be downloaded; adjust the path if necessary.
model <- udpipe_load_model("french-gsd-ud-2.5-191206.udpipe")

# UI: a text-entry panel on the left, a results table on the right.
ui <- fluidPage(
  titlePanel("French Readability and Cohesion Analyzer with UDPipe"),

  sidebarLayout(
    sidebarPanel(
      # Free-form input area for the text to analyze.
      textAreaInput("text", "Enter French text:", value = "",
                    placeholder = "Type or paste French text here",
                    width = '100%', height = '200px', resize = "both"),
      # Analysis is triggered explicitly, not on every keystroke.
      actionButton("analyze", "Analyze")
    ),

    mainPanel(
      h3("Readability and Cohesion Features"),
      tableOutput("results")
    )
  )
|
|
# Annotate the input text with UDPipe (tokens, lemmas, UPOS tags, sentence ids).
annotated <- udpipe_annotate(model, x = text)
annotated_df <- as.data.frame(annotated)

# Calculate readability and cohesion metrics

# Basic metrics: "words" are content words only (nouns, verbs, adjectives, adverbs).
word_count <- nrow(annotated_df[annotated_df$upos %in% c("NOUN", "VERB", "ADJ", "ADV"), ])
sentence_count <- length(unique(annotated_df$sentence_id))

# Syllable count - count vowels in each token as an approximation.
# FIX: gregexpr() returns -1 (a length-1 vector) for tokens with no match,
# so the previous max(0, length(x)) counted one syllable for vowel-less
# tokens (numbers, punctuation). sum(m > 0) counts only real match positions.
syllable_count <- sum(vapply(
  gregexpr("[aeiouyAEIOUY]", annotated_df$token),
  function(m) sum(m > 0),
  integer(1)
))

# Average sentence length (content words per sentence); guarded against empty input.
avg_sentence_length <- ifelse(sentence_count > 0, word_count / sentence_count, 0)

# Average syllables per word; guarded against zero content words.
avg_syllables_per_word <- ifelse(word_count > 0, syllable_count / word_count, 0)

# Lexical Cohesion Metrics
# Sentence-to-Sentence Lexical Cohesion: for each sentence after the first,
# the share of its lemmas also present in the immediately preceding sentence.
sentence_ids <- unique(annotated_df$sentence_id)
# FIX: the previous `for (i in 2:length(sentence_ids))` evaluated to c(2, 1)
# for a single-sentence text (2:1 counts DOWN), indexing past the end of
# sentence_ids. seq_len() over the number of adjacent pairs is empty when
# there are fewer than two sentences, which is the correct behavior.
cohesion_values <- vapply(seq_len(max(0L, length(sentence_ids) - 1L)), function(i) {
  current_sentence <- annotated_df[annotated_df$sentence_id == sentence_ids[i + 1], "lemma"]
  previous_sentence <- annotated_df[annotated_df$sentence_id == sentence_ids[i], "lemma"]
  length(intersect(current_sentence, previous_sentence)) / length(current_sentence)
}, numeric(1))
avg_sentence_to_sentence_cohesion <- ifelse(length(cohesion_values) > 0, mean(cohesion_values, na.rm = TRUE), 0)

# Text-to-Sentence Lexical Cohesion (percentage of words in each sentence
# shared with entire text).
# NOTE(review): text_words holds every lemma in the text, so every sentence
# lemma is trivially "shared"; the ratio reduces to the proportion of
# distinct lemmas within each sentence. Kept as-is to preserve output, but
# consider intersecting with the vocabulary of the OTHER sentences instead.
text_words <- unique(annotated_df$lemma)
text_sentence_cohesion <- vapply(sentence_ids, function(sid) {
  sentence_words <- annotated_df[annotated_df$sentence_id == sid, "lemma"]
  length(intersect(sentence_words, text_words)) / length(sentence_words)
}, numeric(1))
# Empty input would give mean(numeric(0)) = NaN; report 0 like the other metrics.
avg_text_to_sentence_cohesion <- ifelse(length(text_sentence_cohesion) > 0, mean(text_sentence_cohesion, na.rm = TRUE), 0)

# Type-Token Ratio (vocabulary diversity).
# FIX: guard the division — word_count is 0 for text with no content words,
# which previously produced Inf in the results table.
type_token_ratio <- ifelse(word_count > 0, length(unique(annotated_df$lemma)) / word_count, 0)

# Results Data Frame: one row per metric, values rounded for display.
results <- data.frame(
  Metric = c("Word Count", "Sentence Count", "Syllable Count",
             "Average Sentence Length", "Average Syllables per Word",
             "Avg Sentence-to-Sentence Lexical Cohesion",
             "Avg Text-to-Sentence Lexical Cohesion",
             "Type-Token Ratio (Vocabulary Diversity)"),
  Value = c(word_count, sentence_count, syllable_count,
            round(avg_sentence_length, 2), round(avg_syllables_per_word, 2),
            round(avg_sentence_to_sentence_cohesion, 2),
            round(avg_text_to_sentence_cohesion, 2),
            round(type_token_ratio, 2))
)

return(results)
|