Update app.R
Browse files
@@ -1,6 +1,7 @@
1 |
2 |
3 |
4 |
5 |
# Load the French GSD model (ensure it's downloaded and adjust path if necessary)
6 |
model <- udpipe_load_model("french-gsd-ud-2.5-191206.udpipe")
@@ -11,42 +12,42 @@ ui <- fluidPage(
11 |
12 |
13 |
14 |
15 |
placeholder = "Type or paste French text here",
16 |
17 |
actionButton("analyze", "Analyze")
18 |
19 |
20 |
21 |
h3("Readability and Cohesion Features"),
22 |
23 |
24 |
25 |
26 |
27 |
# Define server logic
28 |
server <- function(input, output) {
29 |
30 |
31 |
32 |
33 |
# Annotate text using udpipe with the French model
34 |
annotated <- udpipe_annotate(model, x = text)
35 |
annotated_df <- as.data.frame(annotated)
36 |
37 |
# Calculate readability and cohesion metrics
38 |
39 |
# Basic Metrics
40 |
word_count <- nrow(annotated_df[annotated_df$upos %in% c("NOUN", "VERB", "ADJ", "ADV"), ])
41 |
sentence_count <- length(unique(annotated_df$sentence_id))
42 |
43 |
# Syllable count - count vowels in each token as an approximation
44 |
syllable_count <- sum(sapply(gregexpr("[aeiouyAEIOUY]", annotated_df$token), function(x) max(0, length(x))))
45 |
avg_sentence_length <- ifelse(sentence_count > 0, word_count / sentence_count, 0)
46 |
avg_syllables_per_word <- ifelse(word_count > 0, syllable_count / word_count, 0)
47 |
48 |
# Lexical Cohesion Metrics
49 |
# Sentence-to-Sentence Lexical Cohesion (percentage of words shared between sentences)
50 |
sentence_ids <- unique(annotated_df$sentence_id)
51 |
cohesion_values <- c()
52 |
for (i in 2:length(sentence_ids)) {
@@ -57,7 +58,6 @@ server <- function(input, output) {
57 |
58 |
avg_sentence_to_sentence_cohesion <- ifelse(length(cohesion_values) > 0, mean(cohesion_values, na.rm = TRUE), 0)
59 |
60 |
# Text-to-Sentence Lexical Cohesion (percentage of words in each sentence shared with entire text)
61 |
text_words <- unique(annotated_df$lemma)
62 |
text_sentence_cohesion <- sapply(sentence_ids, function(sid) {
63 |
sentence_words <- annotated_df[annotated_df$sentence_id == sid, "lemma"]
@@ -66,29 +66,74 @@ server <- function(input, output) {
66 |
67 |
avg_text_to_sentence_cohesion <- mean(text_sentence_cohesion, na.rm = TRUE)
68 |
69 |
# Type-Token Ratio (vocabulary diversity)
70 |
type_token_ratio <- length(unique(annotated_df$lemma)) / word_count
71 |
72 |
73 |
74 |
75 |
76 |
77 |
78 |
79 |
80 |
81 |
round(avg_sentence_to_sentence_cohesion, 2),
82 |
round(avg_text_to_sentence_cohesion, 2),
83 |
round(type_token_ratio, 2))
84 |
85 |
86 |
87 |
88 |
89 |
output$results <- renderTable({
90 |
91 |
92 |
93 |
94 |
# Run the application
1 |
2 |
3 |
4 |
5 |
6 |
# Load the French GSD model (ensure it's downloaded and adjust path if necessary)
7 |
model <- udpipe_load_model("french-gsd-ud-2.5-191206.udpipe")
12 |
13 |
14 |
15 |
fileInput("corpus_zip", "Upload ZIP with TXT files (optional)",
16 |
accept = c(".zip")),
17 |
textAreaInput("text", "Or enter French text directly:", value = "",
18 |
placeholder = "Type or paste French text here",
19 |
width = '100%', height = '200px', resize = "both"),
20 |
actionButton("analyze", "Analyze")
21 |
22 |
23 |
24 |
h3("Readability and Cohesion Features"),
25 |
26 |
condition = "output.isCorpus == false",
27 |
28 |
29 |
30 |
condition = "output.isCorpus == true",
31 |
32 |
33 |
34 |
35 |
36 |
37 |
# Define server logic
38 |
server <- function(input, output, session) {
39 |
40 |
# Helper function to calculate metrics for a given text
41 |
calculate_metrics <- function(text) {
42 |
annotated <- udpipe_annotate(model, x = text)
43 |
annotated_df <- as.data.frame(annotated)
44 |
45 |
word_count <- nrow(annotated_df[annotated_df$upos %in% c("NOUN", "VERB", "ADJ", "ADV"), ])
46 |
sentence_count <- length(unique(annotated_df$sentence_id))
47 |
syllable_count <- sum(sapply(gregexpr("[aeiouyAEIOUY]", annotated_df$token), function(x) max(0, length(x))))
48 |
avg_sentence_length <- ifelse(sentence_count > 0, word_count / sentence_count, 0)
49 |
avg_syllables_per_word <- ifelse(word_count > 0, syllable_count / word_count, 0)
50 |
51 |
sentence_ids <- unique(annotated_df$sentence_id)
52 |
cohesion_values <- c()
53 |
for (i in 2:length(sentence_ids)) {
58 |
59 |
avg_sentence_to_sentence_cohesion <- ifelse(length(cohesion_values) > 0, mean(cohesion_values, na.rm = TRUE), 0)
60 |
61 |
text_words <- unique(annotated_df$lemma)
62 |
text_sentence_cohesion <- sapply(sentence_ids, function(sid) {
63 |
sentence_words <- annotated_df[annotated_df$sentence_id == sid, "lemma"]
66 |
67 |
avg_text_to_sentence_cohesion <- mean(text_sentence_cohesion, na.rm = TRUE)
68 |
69 |
type_token_ratio <- length(unique(annotated_df$lemma)) / word_count
70 |
71 |
72 |
"Word Count" = word_count,
73 |
"Sentence Count" = sentence_count,
74 |
"Syllable Count" = syllable_count,
75 |
"Average Sentence Length" = round(avg_sentence_length, 2),
76 |
"Average Syllables per Word" = round(avg_syllables_per_word, 2),
77 |
"Sentence-to-Sentence Lexical Cohesion" = round(avg_sentence_to_sentence_cohesion, 2),
78 |
"Text-to-Sentence Lexical Cohesion" = round(avg_text_to_sentence_cohesion, 2),
79 |
"Type-Token Ratio" = round(type_token_ratio, 2)
80 |
81 |
82 |
83 |
# Reactive to handle single text or corpus input
84 |
results <- eventReactive(input$analyze, {
85 |
if (is.null(input$corpus_zip)) {
86 |
# Single text mode
87 |
text <- input$text
88 |
if (nchar(text) > 0) {
89 |
list(data = calculate_metrics(text), isCorpus = FALSE)
90 |
} else {
91 |
92 |
93 |
} else {
94 |
# Corpus mode: analyze each file in the uploaded ZIP
95 |
temp_dir <- tempdir()
96 |
unzip(input$corpus_zip$datapath, exdir = temp_dir)
97 |
txt_files <- list.files(temp_dir, pattern = "\\.txt$", full.names = TRUE)
98 |
99 |
# Calculate metrics for each text file and store in a list
100 |
corpus_metrics <- lapply(txt_files, function(file) {
101 |
text <- readLines(file, warn = FALSE)
102 |
calculate_metrics(paste(text, collapse = " "))
103 |
104 |
105 |
# Combine metrics into a data frame
106 |
corpus_metrics_df <- do.call(rbind, corpus_metrics)
107 |
list(data = corpus_metrics_df, isCorpus = TRUE)
108 |
109 |
110 |
111 |
# Display results table for single text mode
112 |
output$results <- renderTable({
113 |
if (!is.null(results()) && !results()$isCorpus) {
114 |
115 |
116 |
117 |
118 |
# Display box plots for corpus mode
119 |
output$corpusPlots <- renderPlot({
120 |
if (!is.null(results()) && results()$isCorpus) {
121 |
corpus_metrics_df <- results()$data
122 |
melted_df <- reshape2::melt(corpus_metrics_df)
123 |
124 |
ggplot(melted_df, aes(x = variable, y = value)) +
125 |
geom_boxplot() +
126 |
labs(x = "Metric", y = "Value", title = "Corpus Analysis - Readability and Cohesion Metrics") +
127 |
theme_minimal() +
128 |
theme(axis.text.x = element_text(angle = 45, hjust = 1))
129 |
130 |
131 |
132 |
# Boolean for UI conditionals
133 |
output$isCorpus <- reactive({
134 |
!is.null(results()) && results()$isCorpus
135 |
136 |
outputOptions(output, "isCorpus", suspendWhenHidden = FALSE)
137 |
138 |
139 |
# Run the application