# Default knitr chunk options for this document (no options overridden here).
knitr::opts_chunk$set()
unnest_tokens()
and unnest_ngrams()
to
extract tokens and ngrams from text. For this lab we will be working with the medical record transcriptions from https://www.mtsamples.com/. The data are loaded and “fairly” cleaned at https://github.com/JSC370/jsc370-2023/blob/main/data/medical_transcriptions/.
You should load in dplyr
, (or data.table
if
you want to work that way), ggplot2
and
tidytext
. If you don’t already have tidytext
then you can install with
install.packages("tidytext")
Loading in reference transcription samples from https://www.mtsamples.com/
# Text-mining and data-manipulation packages used throughout the lab.
library(tidytext)
library(readr)
library(dplyr)
library(tidyr)
library(ggplot2)

# Location of the (fairly) cleaned MT Samples transcription data.
data_url <- paste0(
  "https://raw.githubusercontent.com/JSC370/",
  "jsc370-2023/main/data/medical_transcriptions/mtsamples.csv"
)

# Read the CSV and keep only the three columns this lab uses.
mt_samples <- read_csv(data_url)
mt_samples <- select(mt_samples, description, medical_specialty, transcription)

head(mt_samples)
## # A tibble: 6 × 3
## description medic…¹ trans…²
## <chr> <chr> <chr>
## 1 A 23-year-old white female presents with complaint of allergi… Allerg… "SUBJE…
## 2 Consult for laparoscopic gastric bypass. Bariat… "PAST …
## 3 Consult for laparoscopic gastric bypass. Bariat… "HISTO…
## 4 2-D M-Mode. Doppler. Cardio… "2-D M…
## 5 2-D Echocardiogram Cardio… "1. T…
## 6 Morbid obesity. Laparoscopic antecolic antegastric Roux-en-Y… Bariat… "PREOP…
## # … with abbreviated variable names ¹medical_specialty, ²transcription
We can use count()
from dplyr
to figure out
how many different categories we have. Are these categories related?
overlapping? evenly distributed?
# Count transcriptions per specialty, order specialties by frequency, and
# draw a horizontal bar chart (labels suppressed; the axes are self-evident).
specialty_counts <- mt_samples |>
  count(medical_specialty, sort = TRUE) |>
  mutate(medical_specialty = forcats::fct_reorder(medical_specialty, n))

ggplot(specialty_counts, aes(medical_specialty, n)) +
  geom_col() +
  coord_flip() +
  labs(x = NULL, y = NULL) +
  theme_minimal()
transcription
column. Explain what we see from this result. Does it make sense? What insights (if any) do we get?
# Tokenize each transcription into single words and tally word frequencies.
# `tokens` is reused below, so its contents must stay one row per distinct word.
tokens <- mt_samples |>
  select(transcription) |>
  unnest_tokens(word, transcription) |>
  count(word)

# Twenty most common tokens (stop words are still included at this point).
tokens |>
  slice_max(n, n = 20) |>
  ggplot(aes(reorder(word, n), n)) +
  geom_col() +
  coord_flip() +
  labs(x = NULL, y = NULL) +
  theme_minimal()
What do we see now that we have removed stop words? Does it give us a better idea of what the text is about?
# Preview the English stop-word list supplied by the {stopwords} package.
library(stopwords)
head(stopwords("english"))
## [1] "i" "me" "my" "myself" "we" "our"
# The default English list contains 175 words.
length(stopwords("english"))
## [1] 175
# Drop stop words and numeric tokens so the frequency plot highlights
# content-bearing vocabulary.  The original left the "remove numbers" step
# as a TODO even though the word-cloud question below requires it.
tokens_no_stopwords <- tokens |>
  filter(
    !word %in% stopwords("english"),  # remove stop words
    !grepl("[[:digit:]]", word)       # remove tokens containing digits
  )

# Top 20 remaining words, as horizontal bars ordered by frequency.
tokens_no_stopwords |>
  slice_max(n, n = 20) |>
  ggplot(aes(reorder(word, n), n)) +
  theme_minimal() +
  labs(y = NULL, x = NULL) +
  geom_col() +
  coord_flip()
Another method for visualizing word counts is using a word cloud via
wordcloud::wordcloud()
. Create a word cloud for the top 50
most frequent words after removing stop words (and numbers).
# Word cloud of the 50 most frequent tokens after stop-word removal.
library(wordcloud)
## Loading required package: RColorBrewer

tokens50 <- tokens_no_stopwords |>
  slice_max(n, n = 50)

# Word size is driven by frequency; colors cycle through the Set2 palette.
wordcloud(
  tokens50$word,
  tokens50$n,
  colors = brewer.pal(8, "Set2")
)
Repeat question 3, but this time tokenize into bi-grams. How does the result change if you look at tri-grams? (You don’t need to create the word clouds.)
# Build one big alternation regex per position: sw_start matches a bigram
# whose FIRST word is a stop word (patterns like "^i ", "^me ", ...), and
# sw_end matches a bigram whose LAST word is a stop word (" i$", " me$", ...).
# The collapse strings splice the anchors/spaces between every stop word.
# start with any of the stop words
sw_start <- paste0("^", paste(stopwords("english"), collapse = " |^"), " ")
# end with any of the stop words
sw_end <- paste0(" ", paste(stopwords("english"), collapse = "$| "), "$")

# Tokenize into bigrams, drop any bigram that starts or ends with a stop
# word, then count each remaining bigram.
tokens_bigram <- mt_samples |>
  select(transcription) |>
  unnest_tokens(ngram, transcription, token = "ngrams", n = 2) |>
  filter(
    # remove those with stop words
    !grepl(sw_start, ngram, ignore.case = TRUE),
    !grepl(sw_end, ngram, ignore.case = TRUE)
    # remove numbers
  ) |>
  count(ngram)

# Twenty most frequent bigrams as horizontal bars.
tokens_bigram |>
  slice_max(n, n = 20) |>
  ggplot(aes(reorder(ngram, n), n)) +
  theme_minimal() +
  labs(y = NULL, x = NULL) +
  geom_bar(stat = "identity") +
  coord_flip()
Using the results you got from question 4, pick a word and count the words that appear after or before it. (Also, try finding pairs of words after and before the word.)
library(stringr)

# Neighbours of "patient": keep bigrams where "patient" is the first or last
# word, strip "patient" plus the separating space, and total the counts of
# the remaining word.
patient_neighbours <- tokens_bigram |>
  filter(grepl("\\spatient$|^patient\\s", ngram)) |>
  mutate(
    word = sub("patient", "", ngram),
    word = gsub(" ", "", word, fixed = TRUE)
  ) |>
  group_by(word) |>
  summarise(n = sum(n))

# Fifty most common neighbouring words, as horizontal bars.
patient_neighbours |>
  slice_max(n, n = 50) |>
  ggplot(aes(reorder(word, n), n)) +
  geom_col() +
  coord_flip() +
  labs(x = NULL, y = NULL) +
  theme_minimal()
Which words are most used in each of the specialties? You can use
group_by()
and slice_max()
from
dplyr
to have the calculations be done within each
specialty. Remember to remove stop words. How about the 5 most used
words?
Find your own insight in the data:
Ideas:
# TF-IDF of words within each medical specialty, stop words removed.
# `tf_idf_by_specialty` is reused for the sentiment analysis below, so its
# definition is kept exactly as before.
tf_idf_by_specialty <- mt_samples |>
  unnest_tokens(word, transcription) |>
  filter(
    !word %in% stopwords("english")
    # try removing numbers
  ) |>
  count(word, medical_specialty) |>
  bind_tf_idf(word, medical_specialty, n)

# Top-5 TF-IDF words for three contrasting specialties.
top_tf_idf <- tf_idf_by_specialty |>
  group_by(medical_specialty) |>
  slice_max(tf_idf, n = 5) |>
  filter(
    medical_specialty %in% c("Surgery", "Dentistry", "Allergy / Immunology")
  )

ggplot(top_tf_idf, aes(reorder(word, tf_idf), tf_idf)) +
  geom_col() +
  coord_flip() +
  labs(x = NULL, y = NULL) +
  facet_wrap(~ medical_specialty, scales = "free_y") +
  theme_minimal()
# Transcriptions containing the literal token " elevator " (spaces included,
# so partial matches like "elevators" are excluded).
mt_samples |>
  filter(grepl(" elevator ", transcription, fixed = TRUE))
## # A tibble: 65 × 3
## description medic…¹ trans…²
## <chr> <chr> <chr>
## 1 Surgical removal of completely bony impacted teeth #1, #16, … Dentis… PREOPE…
## 2 Total hip arthroplasty on the left. Left hip degenerative a… Surgery PREOPE…
## 3 Tonsillectomy, uvulopalatopharyngoplasty, and septoplasty fo… Surgery PREOPE…
## 4 Left mesothelioma, focal. Left anterior pleural-based nodul… Surgery PREOPE…
## 5 Extraction of tooth #T and incision and drainage (I&D) of ri… Surgery PREOPE…
## 6 Extraction of teeth #2 and #19 and incision and drainage (I&… Surgery PREOPE…
## 7 Right argon laser assisted stapedectomy. Bilateral conducti… Surgery PREOPE…
## 8 Open reduction and internal plate and screw fixation of depr… Surgery PREOPE…
## 9 Laparoscopic right salpingooophorectomy. Right pelvic pain … Surgery PREOPE…
## 10 Revision rhinoplasty and left conchal cartilage harvest to c… Surgery PREOPE…
## # … with 55 more rows, and abbreviated variable names ¹medical_specialty,
## # ²transcription
# Bing sentiment lexicon: one row per word, sentiment = "positive"/"negative".
sentiment_list <- get_sentiments("bing")
# Left join keeps every word; words absent from the lexicon get NA sentiment.
# NOTE(review): if a word carried both sentiments in the lexicon this join
# would duplicate its row — worth confirming against the Bing word list.
sentiments_in_med <- tf_idf_by_specialty |>
left_join(sentiment_list, by = "word")
# Per specialty, total the counts of positive, negative, and all words.
# ifelse() on an NA sentiment yields NA; na.rm = TRUE drops those words from
# the positive/negative sums while `n = sum(n)` still counts every word.
sentiments_in_med_by_sp <- sentiments_in_med |>
group_by(medical_specialty) |>
summarise(
n_positive = sum(ifelse(sentiment == "positive", n, 0), na.rm = TRUE),
n_negative = sum(ifelse(sentiment == "negative", n, 0), na.rm = TRUE),
n = sum(n)
)
# Diverging bars: negative share of words to the left (pink), positive share
# to the right (dark green); specialties ordered by overall sentiment share.
sentiments_in_med_by_sp |>
ggplot(aes(reorder(medical_specialty, (n_negative + n_positive) / n))) +
theme_minimal() +
geom_col(aes(y = - n_negative / n), fill = "pink") +
geom_col(aes(y = n_positive / n), fill = "darkgreen") +
labs(x = NULL, y = NULL) +
coord_flip()