# Setup ----
# Load (installing if necessary) every package used in this notebook.
pacman::p_load(
  rio,
  here,
  janitor,
  udpipe,
  quanteda,
  quanteda.textplots,
  quanteda.textstats,
  tidyverse
)
# pacman::p_load_gh("trinker/entity")
# remotes::install_gitlab("culturalcartography/text2map.theme")

# Apply the text2map plotting theme.
text2map.theme::set_theme()
# remotes::install_github("quanteda/quanteda.corpora")
# remotes::install_gitlab("culturalcartography/text2map.corpora")

# Read the corpus from data/corpus.csv (path resolved relative to the
# project root by here()).
corpus <- import(here("data", "corpus.csv"))
CausalizeR package testing
In [1]:
In [2]:
#' Extract the driver and the affected entity from a passive-voice sentence
#'
#' The nearest noun *before* the effect is taken as the affected entity and
#' the nearest noun *after* the effect as the driver. Only the first
#' occurrence of `effect` in the sentence is used.
#'
#' @param effect A single lemma (character) to look for in `sentence$lemma`.
#' @param sentence A data frame (or list) with one element per token and at
#'   least the columns `lemma` and `xpos` (Penn Treebank tags are compared
#'   against "NN", "NNS" and "JJ").
#' @return A one-row data frame with columns `driver`, `affected` and
#'   `effect`, or `NULL` when the effect is absent, has no token on one of
#'   its sides, or no noun is found on one of its sides.
passivevoice <- function(effect = NULL, sentence = NULL) {
  noun_tags <- c("NN", "NNS")
  lemma <- sentence$lemma
  xpos <- sentence$xpos
  n_tokens <- length(lemma)

  effect_pos <- which(lemma == effect)[1]
  # Without at least one token on each side there is nothing to scan; the
  # previous implementation failed with an `if` error in these cases.
  if (is.na(effect_pos) || effect_pos <= 1 || effect_pos >= n_tokens) {
    return(NULL)
  }

  # Affected entity: walk backwards from the effect to the nearest noun.
  affected <- NULL
  for (k in (effect_pos - 1):1) {
    if (xpos[k] %in% noun_tags) {
      if (k > 1 && xpos[k - 1] %in% c(noun_tags, "JJ")) {
        # Prepend a preceding noun/adjective modifier. The two lemmas are
        # joined with a space, exactly as before (the former
        # `collapse = "_"` had no effect on a single string).
        affected <- paste(lemma[k - 1], lemma[k])
      } else {
        affected <- lemma[k]
      }
      break
    }
  }

  # Driver: walk forwards from the effect to the nearest noun.
  driver <- NULL
  for (m in (effect_pos + 1):n_tokens) {
    if (xpos[m] %in% noun_tags) {
      if (m == n_tokens) {
        # Last token: only an adjective directly before it can be attached.
        if (xpos[m - 1] %in% "JJ") {
          driver <- paste(lemma[m - 1], lemma[m])
        } else {
          driver <- lemma[m]
        }
      } else if (xpos[m + 1] %in% noun_tags) {
        # Noun-noun compound: attach the following noun.
        driver <- paste(lemma[m], lemma[m + 1])
      } else {
        driver <- lemma[m]
      }
      # Stop at the first noun. The old condition tested `xpos[k]` (the
      # stale index of the backward loop), so a plural driver could be
      # overwritten by a later noun.
      break
    }
  }

  if (is.null(driver) || is.null(affected)) {
    return(NULL)
  }
  cbind.data.frame(driver = driver, affected = affected, effect = effect)
}
#' Extract the driver and the affected entity from an active-voice sentence
#'
#' The nearest noun *before* the effect is taken as the driver and the
#' nearest noun *after* the effect as the affected entity. Only the first
#' occurrence of `effect` in the sentence is used.
#'
#' @param effect A single lemma (character) to look for in `sentence$lemma`.
#' @param sentence A data frame (or list) with one element per token and at
#'   least the columns `lemma` and `xpos` (Penn Treebank tags are compared
#'   against "NN", "NNS" and "JJ").
#' @return A one-row data frame with columns `driver`, `affected` and
#'   `effect`, or `NULL` when the effect is absent, has no token on one of
#'   its sides, or no noun is found on one of its sides.
activevoice <- function(effect = NULL, sentence = NULL) {
  noun_tags <- c("NN", "NNS")
  lemma <- sentence$lemma
  xpos <- sentence$xpos
  n_tokens <- length(lemma)

  effect_pos <- which(lemma == effect)[1]
  # Without at least one token on each side there is nothing to scan; the
  # previous implementation failed with an `if` error in these cases.
  if (is.na(effect_pos) || effect_pos <= 1 || effect_pos >= n_tokens) {
    return(NULL)
  }

  # Driver: walk backwards from the effect to the nearest noun.
  driver <- NULL
  for (k in (effect_pos - 1):1) {
    if (xpos[k] %in% noun_tags) {
      if (k > 1 && xpos[k - 1] %in% c(noun_tags, "JJ")) {
        # Prepend a preceding noun/adjective modifier. The two lemmas are
        # joined with a space, exactly as before (the former
        # `collapse = "_"` had no effect on a single string).
        driver <- paste(lemma[k - 1], lemma[k])
      } else {
        driver <- lemma[k]
      }
      # Stop at the first noun, singular or plural. Previously the loop
      # only stopped on "NN", so a plural noun next to the verb was
      # overwritten by a noun farther away.
      break
    }
  }

  # Affected entity: walk forwards from the effect to the nearest noun.
  affected <- NULL
  for (m in (effect_pos + 1):n_tokens) {
    if (xpos[m] %in% noun_tags) {
      if (m == n_tokens) {
        # Last token: only an adjective directly before it can be attached.
        if (xpos[m - 1] %in% "JJ") {
          affected <- paste(lemma[m - 1], lemma[m])
        } else {
          affected <- lemma[m]
        }
      } else if (xpos[m + 1] %in% noun_tags) {
        # Noun-noun compound: attach the following noun.
        affected <- paste(lemma[m], lemma[m + 1])
      } else {
        affected <- lemma[m]
      }
      # Same rule as above: the first noun found wins.
      break
    }
  }

  if (is.null(driver) || is.null(affected)) {
    return(NULL)
  }
  cbind.data.frame(driver = driver, affected = affected, effect = effect)
}
#' Extract driver / affected pairs around an effect lemma from a set of texts
#'
#' Each text is annotated with udpipe. Every sentence whose lemmas contain
#' `effect` is kept if it has a token tagged `NOUN` both before and after the
#' first occurrence of the effect; it is then handed to `passivevoice()` when
#' a passive marker ("in response to", "by", "with", "as a result of") is
#' found, and to `activevoice()` otherwise.
#'
#' @param texts Character vector of texts to analyse.
#' @param effect A single lemma to search for.
#' @param effect_num Value stored in the `effect_num` column of every result
#'   row (default 0).
#' @param tokenizer A loaded udpipe model. When `NULL`, the "english" model
#'   is downloaded with `udpipe_download_model()` and loaded.
#' @return A data frame with columns `driver`, `affected`, `effect`,
#'   `effect_num` and `text_number` (index of the text in `texts`); an empty
#'   data frame when nothing is found.
causalize <- function(texts = NULL, effect = NULL, effect_num = 0, tokenizer = NULL) {
  if (is.null(tokenizer)) {
    model_info <- udpipe_download_model("english")
    tokenizer <- udpipe_load_model(model_info$file_model)
  }

  # Nothing to annotate; txtProgressBar() also rejects max == min.
  if (length(texts) == 0) {
    return(data.frame())
  }

  passive_markers <- "in response to|\\bby\\b|\\bwith\\b|\\bas a result of"

  progbar <- txtProgressBar(min = 0, max = length(texts), style = 3)
  on.exit(close(progbar), add = TRUE)
  res.mat <- list()

  for (w in seq_along(texts)) {
    annotated_text <- as.data.frame(udpipe_annotate(tokenizer, texts[w]))

    # Ids of the sentences in which the effect lemma occurs.
    sent.num <- unique(
      annotated_text$sentence_id[annotated_text$lemma %in% effect]
    )

    for (j in sent.num) {
      sent1 <- annotated_text[annotated_text$sentence_id == j, ]
      n_tokens <- nrow(sent1)
      effect_indices <- which(sent1$lemma == effect)
      first_effect <- effect_indices[1]

      ## Ensure that there are nouns before and after the verb. Skip the
      ## sentence if this requirement is not met (including when the effect
      ## opens or closes the sentence, leaving nothing on one side).
      if (any(effect_indices == 1) || first_effect >= n_tokens) {
        next
      }
      noun_before <- "NOUN" %in% sent1$upos[seq_len(first_effect - 1)]
      noun_after <- "NOUN" %in% sent1$upos[(first_effect + 1):n_tokens]
      if (!noun_before || !noun_after) {
        next
      }

      # grep() is applied to the whole data frame, as in the original code:
      # every column is coerced to a string, so the markers are searched in
      # all annotation columns, not in one specific text column.
      is_passive <- length(grep(passive_markers, sent1)) > 0
      relation <- if (is_passive) {
        passivevoice(effect = effect, sentence = sent1)
      } else {
        activevoice(effect = effect, sentence = sent1)
      }

      # The helpers return NULL when they find no driver/affected pair; such
      # a row would lack columns and break the final rbind.
      if (is.null(relation)) {
        next
      }
      res.mat[[length(res.mat) + 1]] <- cbind.data.frame(
        relation,
        effect_num = effect_num,
        text_number = w
      )
    }

    # Updated for every text, including those without any matching sentence.
    setTxtProgressBar(progbar, w)
  }

  do.call(rbind.data.frame, res.mat)
}
Package description
The causalize function determines which sentences contain a specified effect (matched as a single lemma) and then classifies each of these sentences as either active or passive voice. It uses the udpipe package for tokenization and annotation of the text.
Initialization: Loads a udpipe model for English if no tokenizer is provided. Sets up a progress bar to monitor the analysis progress across the text entries.
Data Processing Loop: Iterates over each text entry, tokenizes and annotates it to create a dataframe where each word is broken down by sentence, lemma, and part-of-speech tag. Identifies sentences that contain the specified effect (word or phrase).
Voice Classification: For each sentence containing the effect, it checks the position of the effect and the types of words before and after it. Sentences in which the effect is the first word are skipped. If there is at least one noun before the effect and at least one noun after it, the sentence is classified as passive when it contains one of the markers “in response to”, “by”, “with”, or “as a result of”, and as active otherwise. Depending on the classification, the sentence is processed by either the passivevoice or the activevoice function (these functions return a dataframe describing the sentence in context).
The passivevoice and activevoice functions extract the subjects (drivers) and objects (affected entities) related to the effect in the sentence. passivevoice function: searches for nouns before the effect to identify the affected entity, and for nouns after the effect to identify the driver. activevoice function: searches for nouns before the effect to identify the driver, and for nouns after the effect to identify the affected entity.
In [3]:
# effects<-cbind.data.frame(effect=c("decrease","increase"),effect_direction=c((-1),1))
# temp.list<-list()
# for(i in 1:length(effects$effect)){
# temp.list[[i]]<-causalize(corpus$Text,effect=effects[i,1],effect_num=effects[i,2])
# }
# saveRDS(temp.list, here("data/drivers_list.rds"))
# decrease_driver <- temp.list[[1]]
# increase_driver <- temp.list[[2]]
# # Error in if (which(sent1$lemma == effect) == 1) { :
# # the condition has length > 1
# # export temp.list to .Rds file
# # test <- import(here("data/drivers_list.rds"))
# dat<-do.call(rbind.data.frame,temp.list)
# dat$driver<-as.character(dat$driver)
# dat$affected<-as.character(dat$affected)
View result
In [4]:
# Load the previously saved results and show them as an interactive table.
# NOTE(review): the commented-out code above saves `temp.list` (a list with
# one data frame per effect) to this path. If import() returns that list
# rather than a single data frame, bind it first with
# do.call(rbind.data.frame, dat) -- confirm against the stored file.
dat <- import(here("data/drivers_list.rds"))
DT::datatable(dat)
Small example
In [5]:
# effects<-cbind.data.frame(effect=c("decrease","increase"),effect_direction=c((-1),1))
# example.list<-list()
# for(i in 1:length(effects$effect)){
# example.list[[i]]<-causalize(corpus$Text[1:5],effect=effects[i,1],effect_num=effects[i,2])
# }
# saveRDS(example.list, here("data/example_list.rds"))
# example.list <- import(here("data/example_list.rds"))
# decrease_driver <- example.list[[1]]
# increase_driver <- example.list[[2]]
# # Error in if (which(sent1$lemma == effect) == 1) { :
# # the condition has length > 1
# # export temp.list to .Rds file
# # test <- import(here("data/drivers_list.rds"))
# example<-do.call(rbind.data.frame,example.list)
# example$driver<-as.character(example$driver)
# example$affected<-as.character(example$affected)
# Load the saved small example and show it as an interactive table.
# NOTE(review): the commented-out code above saves `example.list` (a list
# with one data frame per effect) to this path. If import() returns that
# list rather than a single data frame, bind it first with
# do.call(rbind.data.frame, example) -- confirm against the stored file.
example <- import(here("data/example_list.rds"))
DT::datatable(example)