Update app.R
Browse files
app.R
CHANGED
@@ -1,6 +1,7 @@
|
|
1 |
library(shiny)
|
2 |
library(udpipe)
|
3 |
library(stringr)
|
|
|
4 |
|
5 |
# Load the French GSD model (ensure it's downloaded and adjust path if necessary)
|
6 |
model <- udpipe_load_model("french-gsd-ud-2.5-191206.udpipe")
|
@@ -11,42 +12,42 @@ ui <- fluidPage(
|
|
11 |
|
12 |
sidebarLayout(
|
13 |
sidebarPanel(
|
14 |
-
|
|
|
|
|
15 |
placeholder = "Type or paste French text here",
|
16 |
-
width
|
17 |
actionButton("analyze", "Analyze")
|
18 |
),
|
19 |
|
20 |
mainPanel(
|
21 |
h3("Readability and Cohesion Features"),
|
22 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
23 |
)
|
24 |
)
|
25 |
)
|
26 |
|
27 |
# Define server logic
|
28 |
-
server <- function(input, output) {
|
29 |
|
30 |
-
|
31 |
-
|
32 |
-
|
33 |
-
# Annotate text using udpipe with the French model
|
34 |
annotated <- udpipe_annotate(model, x = text)
|
35 |
annotated_df <- as.data.frame(annotated)
|
36 |
|
37 |
-
# Calculate readability and cohesion metrics
|
38 |
-
|
39 |
-
# Basic Metrics
|
40 |
word_count <- nrow(annotated_df[annotated_df$upos %in% c("NOUN", "VERB", "ADJ", "ADV"), ])
|
41 |
sentence_count <- length(unique(annotated_df$sentence_id))
|
42 |
-
|
43 |
-
# Syllable count - count vowels in each token as an approximation
|
44 |
syllable_count <- sum(sapply(gregexpr("[aeiouyAEIOUY]", annotated_df$token), function(x) max(0, length(x))))
|
45 |
avg_sentence_length <- ifelse(sentence_count > 0, word_count / sentence_count, 0)
|
46 |
avg_syllables_per_word <- ifelse(word_count > 0, syllable_count / word_count, 0)
|
47 |
|
48 |
-
# Lexical Cohesion Metrics
|
49 |
-
# Sentence-to-Sentence Lexical Cohesion (percentage of words shared between sentences)
|
50 |
sentence_ids <- unique(annotated_df$sentence_id)
|
51 |
cohesion_values <- c()
|
52 |
for (i in 2:length(sentence_ids)) {
|
@@ -57,7 +58,6 @@ server <- function(input, output) {
|
|
57 |
}
|
58 |
avg_sentence_to_sentence_cohesion <- ifelse(length(cohesion_values) > 0, mean(cohesion_values, na.rm = TRUE), 0)
|
59 |
|
60 |
-
# Text-to-Sentence Lexical Cohesion (percentage of words in each sentence shared with entire text)
|
61 |
text_words <- unique(annotated_df$lemma)
|
62 |
text_sentence_cohesion <- sapply(sentence_ids, function(sid) {
|
63 |
sentence_words <- annotated_df[annotated_df$sentence_id == sid, "lemma"]
|
@@ -66,29 +66,74 @@ server <- function(input, output) {
|
|
66 |
})
|
67 |
avg_text_to_sentence_cohesion <- mean(text_sentence_cohesion, na.rm = TRUE)
|
68 |
|
69 |
-
# Type-Token Ratio (vocabulary diversity)
|
70 |
type_token_ratio <- length(unique(annotated_df$lemma)) / word_count
|
71 |
|
72 |
-
|
73 |
-
|
74 |
-
|
75 |
-
|
76 |
-
|
77 |
-
|
78 |
-
|
79 |
-
|
80 |
-
|
81 |
-
round(avg_sentence_to_sentence_cohesion, 2),
|
82 |
-
round(avg_text_to_sentence_cohesion, 2),
|
83 |
-
round(type_token_ratio, 2))
|
84 |
)
|
85 |
-
|
86 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
87 |
})
|
88 |
|
|
|
89 |
output$results <- renderTable({
|
90 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
91 |
})
|
|
|
92 |
}
|
93 |
|
94 |
# Run the application
|
|
|
1 |
library(shiny)
|
2 |
library(udpipe)
|
3 |
library(stringr)
|
4 |
+
library(ggplot2)
|
5 |
|
6 |
# Load the French GSD model (ensure it's downloaded and adjust path if necessary)
|
7 |
model <- udpipe_load_model("french-gsd-ud-2.5-191206.udpipe")
|
|
|
12 |
|
13 |
sidebarLayout(
|
14 |
sidebarPanel(
|
15 |
+
fileInput("corpus_zip", "Upload ZIP with TXT files (optional)",
|
16 |
+
accept = c(".zip")),
|
17 |
+
textAreaInput("text", "Or enter French text directly:", value = "",
|
18 |
placeholder = "Type or paste French text here",
|
19 |
+
width = '100%', height = '200px', resize = "both"),
|
20 |
actionButton("analyze", "Analyze")
|
21 |
),
|
22 |
|
23 |
mainPanel(
|
24 |
h3("Readability and Cohesion Features"),
|
25 |
+
conditionalPanel(
|
26 |
+
condition = "output.isCorpus == false",
|
27 |
+
tableOutput("results")
|
28 |
+
),
|
29 |
+
conditionalPanel(
|
30 |
+
condition = "output.isCorpus == true",
|
31 |
+
plotOutput("corpusPlots")
|
32 |
+
)
|
33 |
)
|
34 |
)
|
35 |
)
|
36 |
|
37 |
# Define server logic
|
38 |
+
server <- function(input, output, session) {
|
39 |
|
40 |
+
# Helper function to calculate metrics for a given text
|
41 |
+
calculate_metrics <- function(text) {
|
|
|
|
|
42 |
annotated <- udpipe_annotate(model, x = text)
|
43 |
annotated_df <- as.data.frame(annotated)
|
44 |
|
|
|
|
|
|
|
45 |
word_count <- nrow(annotated_df[annotated_df$upos %in% c("NOUN", "VERB", "ADJ", "ADV"), ])
|
46 |
sentence_count <- length(unique(annotated_df$sentence_id))
|
|
|
|
|
47 |
syllable_count <- sum(sapply(gregexpr("[aeiouyAEIOUY]", annotated_df$token), function(x) max(0, length(x))))
|
48 |
avg_sentence_length <- ifelse(sentence_count > 0, word_count / sentence_count, 0)
|
49 |
avg_syllables_per_word <- ifelse(word_count > 0, syllable_count / word_count, 0)
|
50 |
|
|
|
|
|
51 |
sentence_ids <- unique(annotated_df$sentence_id)
|
52 |
cohesion_values <- c()
|
53 |
for (i in 2:length(sentence_ids)) {
|
|
|
58 |
}
|
59 |
avg_sentence_to_sentence_cohesion <- ifelse(length(cohesion_values) > 0, mean(cohesion_values, na.rm = TRUE), 0)
|
60 |
|
|
|
61 |
text_words <- unique(annotated_df$lemma)
|
62 |
text_sentence_cohesion <- sapply(sentence_ids, function(sid) {
|
63 |
sentence_words <- annotated_df[annotated_df$sentence_id == sid, "lemma"]
|
|
|
66 |
})
|
67 |
avg_text_to_sentence_cohesion <- mean(text_sentence_cohesion, na.rm = TRUE)
|
68 |
|
|
|
69 |
type_token_ratio <- length(unique(annotated_df$lemma)) / word_count
|
70 |
|
71 |
+
data.frame(
|
72 |
+
"Word Count" = word_count,
|
73 |
+
"Sentence Count" = sentence_count,
|
74 |
+
"Syllable Count" = syllable_count,
|
75 |
+
"Average Sentence Length" = round(avg_sentence_length, 2),
|
76 |
+
"Average Syllables per Word" = round(avg_syllables_per_word, 2),
|
77 |
+
"Sentence-to-Sentence Lexical Cohesion" = round(avg_sentence_to_sentence_cohesion, 2),
|
78 |
+
"Text-to-Sentence Lexical Cohesion" = round(avg_text_to_sentence_cohesion, 2),
|
79 |
+
"Type-Token Ratio" = round(type_token_ratio, 2)
|
|
|
|
|
|
|
80 |
)
|
81 |
+
}
|
82 |
+
|
83 |
+
# Reactive to handle single text or corpus input
|
84 |
+
results <- eventReactive(input$analyze, {
|
85 |
+
if (is.null(input$corpus_zip)) {
|
86 |
+
# Single text mode
|
87 |
+
text <- input$text
|
88 |
+
if (nchar(text) > 0) {
|
89 |
+
list(data = calculate_metrics(text), isCorpus = FALSE)
|
90 |
+
} else {
|
91 |
+
NULL
|
92 |
+
}
|
93 |
+
} else {
|
94 |
+
# Corpus mode: analyze each file in the uploaded ZIP
|
95 |
+
temp_dir <- tempdir()
|
96 |
+
unzip(input$corpus_zip$datapath, exdir = temp_dir)
|
97 |
+
txt_files <- list.files(temp_dir, pattern = "\\.txt$", full.names = TRUE)
|
98 |
+
|
99 |
+
# Calculate metrics for each text file and store in a list
|
100 |
+
corpus_metrics <- lapply(txt_files, function(file) {
|
101 |
+
text <- readLines(file, warn = FALSE)
|
102 |
+
calculate_metrics(paste(text, collapse = " "))
|
103 |
+
})
|
104 |
+
|
105 |
+
# Combine metrics into a data frame
|
106 |
+
corpus_metrics_df <- do.call(rbind, corpus_metrics)
|
107 |
+
list(data = corpus_metrics_df, isCorpus = TRUE)
|
108 |
+
}
|
109 |
})
|
110 |
|
111 |
+
# Display results table for single text mode
|
112 |
output$results <- renderTable({
|
113 |
+
if (!is.null(results()) && !results()$isCorpus) {
|
114 |
+
results()$data
|
115 |
+
}
|
116 |
+
})
|
117 |
+
|
118 |
+
# Display box plots for corpus mode
|
119 |
+
output$corpusPlots <- renderPlot({
|
120 |
+
if (!is.null(results()) && results()$isCorpus) {
|
121 |
+
corpus_metrics_df <- results()$data
|
122 |
+
melted_df <- reshape2::melt(corpus_metrics_df)
|
123 |
+
|
124 |
+
ggplot(melted_df, aes(x = variable, y = value)) +
|
125 |
+
geom_boxplot() +
|
126 |
+
labs(x = "Metric", y = "Value", title = "Corpus Analysis - Readability and Cohesion Metrics") +
|
127 |
+
theme_minimal() +
|
128 |
+
theme(axis.text.x = element_text(angle = 45, hjust = 1))
|
129 |
+
}
|
130 |
+
})
|
131 |
+
|
132 |
+
# Boolean for UI conditionals
|
133 |
+
output$isCorpus <- reactive({
|
134 |
+
!is.null(results()) && results()$isCorpus
|
135 |
})
|
136 |
+
outputOptions(output, "isCorpus", suspendWhenHidden = FALSE)
|
137 |
}
|
138 |
|
139 |
# Run the application
|