In this lab we use `unnest_tokens()` and `unnest_ngrams()` to extract tokens and n-grams from text. We will be working with the medical record transcriptions from https://www.mtsamples.com/, which are loaded and "fairly" cleaned at https://github.com/JSC370/jsc370-2023/blob/main/data/medical_transcriptions/.
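To see what these two functions do before touching the real data, here is a minimal sketch on a made-up sentence (the tibble and its contents are illustrative only):

library(tidytext)
example <- dplyr::tibble(text = "the patient was seen in the clinic today")
unnest_tokens(example, word, text)           # one row per word
unnest_ngrams(example, ngram, text, n = 2)   # one row per bigram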
You should load in `dplyr` (or `data.table` if you want to work that way), `ggplot2`, and `tidytext`. If you don't already have `tidytext` and `stopwords`, you can install them with:
install.packages("tidytext")
install.packages("stopwords")
Loading in reference transcription samples from https://www.mtsamples.com/
library(tidytext)
library(readr)
library(dplyr)
library(tidyr)
library(ggplot2)
data_url <- paste0(
"https://raw.githubusercontent.com/JSC370/",
"jsc370-2023/main/data/medical_transcriptions/mtsamples.csv"
)
mt_samples <- read_csv(data_url)
mt_samples <- mt_samples |>
select(description, medical_specialty, transcription)
head(mt_samples)
## # A tibble: 6 × 3
## description medic…¹ trans…²
## <chr> <chr> <chr>
## 1 A 23-year-old white female presents with complaint of allergi… Allerg… "SUBJE…
## 2 Consult for laparoscopic gastric bypass. Bariat… "PAST …
## 3 Consult for laparoscopic gastric bypass. Bariat… "HISTO…
## 4 2-D M-Mode. Doppler. Cardio… "2-D M…
## 5 2-D Echocardiogram Cardio… "1. T…
## 6 Morbid obesity. Laparoscopic antecolic antegastric Roux-en-Y… Bariat… "PREOP…
## # … with abbreviated variable names ¹medical_specialty, ²transcription
We can use `count()` from `dplyr` to figure out how many different categories we have. Are these categories related? Overlapping? Evenly distributed?
mt_samples |>
count(medical_specialty, sort = TRUE) |>
mutate(medical_specialty = forcats::fct_reorder(medical_specialty, n)) |>
ggplot(aes(medical_specialty, n)) +
theme_minimal() +
labs(y = NULL, x = NULL) +
geom_col() +
coord_flip()
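The "evenly distributed?" part can also be checked numerically; a quick sketch (our addition, not part of the lab's code) adds a proportion column to the same count:

mt_samples |>
  count(medical_specialty, sort = TRUE) |>
  mutate(prop = n / sum(n)) |>
  head()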
Tokenize the words in the `transcription` column and count the number of times each token appears. Visualize the top 20 most frequent words. Explain what we see from this result. Does it make sense? What insights (if any) do we get?
tokens <- mt_samples |>
select(transcription) |>
unnest_tokens(word, transcription) |>
# group_by(word) |>
# summarise(n = n()) # same as `count(word)`
count(word)
tokens |>
# arrange(across(n, desc)) |>
# head(20) |> # same as `slice_max(n, n = 20)`
slice_max(n, n = 20) |>
ggplot(aes(reorder(word, n), n)) +
theme_minimal() +
labs(y = NULL, x = NULL) +
geom_bar(stat = "identity") +
coord_flip()
What do we see now that we have removed stop words? Does it give us a better idea of what the text is about?
library(stopwords)
head(stopwords("english"))
## [1] "i" "me" "my" "myself" "we" "our"
length(stopwords("english"))
## [1] 175
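An aside: by default `stopwords()` uses the Snowball lexicon. The `stopwords` package also ships other sources via its `source` argument, e.g. the longer SMART list (shown here as an option, not a requirement of the lab):

head(stopwords("en", source = "smart"))
length(stopwords("en", source = "smart"))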
tokens_no_stopwords <- tokens |>
filter(
!(word %in% stopwords("english")),
!grepl("[[:digit:]]+", word)
)
tokens_no_stopwords |>
slice_max(n, n = 20) |>
ggplot(aes(reorder(word, n), n)) +
theme_minimal() +
labs(y = NULL, x = NULL) +
geom_bar(stat = "identity") +
coord_flip()
Another method for visualizing word counts is a word cloud via `wordcloud::wordcloud()`. Create a word cloud for the top 50 most frequent words after removing stop words (and numbers).
library(wordcloud)
## Loading required package: RColorBrewer
tokens50 <- tokens_no_stopwords |>
slice_max(n, n = 50)
wordcloud(tokens50$word, tokens50$n,
colors = brewer.pal(8, "Set2"))
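An alternative sketch: rather than pre-slicing the top 50 ourselves, `wordcloud()` can cap the number of words via its `max.words` argument (least frequent terms are dropped):

wordcloud(tokens_no_stopwords$word, tokens_no_stopwords$n,
          max.words = 50, colors = brewer.pal(8, "Set2"))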
Repeat question 3, but this time tokenize into bi-grams. How does the result change if you look at tri-grams? (You don’t need to create the word clouds.)
# start with any of the stop words
sw_start <- paste0("^", paste(stopwords("english"), collapse = " |^"), " ")
# end with any of the stop words
sw_end <- paste0(" ", paste(stopwords("english"), collapse = "$| "), "$")
# 2-gram
tokens_bigram <- mt_samples |>
select(transcription) |>
unnest_tokens(ngram, transcription, token = "ngrams", n = 2) |>
filter(
# remove those with stop words
!grepl(sw_start, ngram, ignore.case = TRUE),
!grepl(sw_end, ngram, ignore.case = TRUE),
!grepl("\\d", ngram) # remove numbers
) |>
count(ngram)
# 3-gram
tokens_trigram <- mt_samples |>
select(transcription) |>
unnest_tokens(ngram, transcription, token = "ngrams", n = 3) |>
filter(
# remove those with stop words
!grepl(sw_start, ngram, ignore.case = TRUE),
!grepl(sw_end, ngram, ignore.case = TRUE),
!grepl("^\\d|\\d$", ngram) # remove if end or start with a number
) |>
count(ngram)
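If the regular expressions above look opaque, printing a prefix of each shows the structure: every alternative is an anchored stop word with an adjacent space (e.g. `sw_start` begins `"^i |^me |^my |..."`):

substr(sw_start, 1, 30)
substr(sw_end, 1, 30)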
library(stringr) # if you are more comfortable with str_*()
tokens_trigram <- mt_samples |>
select(transcription) |>
unnest_tokens(ngram, transcription, token = "ngrams", n = 3) |>
filter(
!str_detect(ngram, regex(sw_start, ignore_case = TRUE)),
!str_detect(ngram, regex(sw_end, ignore_case = TRUE)),
!str_detect(ngram, "\\d")
) |>
count(ngram)
tokens_bigram |>
slice_max(n, n = 20) |>
ggplot(aes(reorder(ngram, n), n)) +
theme_minimal() +
labs(y = NULL, x = NULL) +
geom_bar(stat = 'identity') +
coord_flip()
tokens_trigram |>
slice_max(n, n = 20) |>
ggplot(aes(reorder(ngram, n), n)) +
theme_minimal() +
labs(y = NULL, x = NULL) +
geom_bar(stat = 'identity') +
coord_flip()
The n-gram word clouds aren't as helpful as the word cloud built from single words.
tokens10_bigram <- tokens_bigram |>
arrange(across(n, desc)) |>
head(10)
tokens10_trigram <- tokens_trigram |>
arrange(across(n, desc)) |>
head(10)
wordcloud(tokens10_bigram$ngram, tokens10_bigram$n)
wordcloud(tokens10_trigram$ngram, tokens10_trigram$n)
## Warning in wordcloud(tokens10_trigram$ngram, tokens10_trigram$n): prepped and
## draped could not be fit on page. It will not be plotted.
## Warning in wordcloud(tokens10_trigram$ngram, tokens10_trigram$n): tolerated the
## procedure could not be fit on page. It will not be plotted.
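The "could not be fit on page" warnings mean some trigrams are too long at the default font sizes; one workaround (a judgment call, not part of the lab) is to shrink `wordcloud()`'s `scale` argument, which controls the range of word sizes:

wordcloud(tokens10_trigram$ngram, tokens10_trigram$n,
          scale = c(2, 0.5))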
Using the results you got from question 4, pick a word and count the words that appear before or after it.
library(stringr)
# e.g., words that come before or after patient
tokens_bigram |>
filter(str_detect(ngram, regex("\\spatient$|^patient\\s"))) |>
mutate(word = str_remove(ngram, "patient"),
word = str_remove_all(word, " ")) |>
group_by(word) |>
summarise(n = sum(n)) |>
slice_max(n, n = 50) |>
ggplot(aes(reorder(word, n), n)) +
theme_minimal() +
labs(y = NULL, x = NULL) +
geom_bar(stat = "identity") +
coord_flip()
# e.g., pairs of words that come before and after patient
tokens_trigram |>
filter(str_detect(ngram, regex("\\spatient\\s"))) |>
mutate(word = str_replace(ngram, "patient", "---")) |>
group_by(word) |>
summarise(n = sum(n)) |>
slice_max(n, n = 50) |>
ggplot(aes(reorder(word, n), n)) +
theme_minimal() +
labs(y = NULL, x = NULL) +
geom_bar(stat = "identity") +
coord_flip()
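To repeat this for other words, it may help to wrap the bigram step in a small helper. The function below is illustrative only (the name `words_around()` is ours, not the lab's), assuming `tokens_bigram` and `stringr` from above:

# count the words that appear immediately before or after `target`
words_around <- function(target) {
  pattern <- paste0("\\s", target, "$|^", target, "\\s")
  tokens_bigram |>
    filter(str_detect(ngram, regex(pattern))) |>
    mutate(word = str_squish(str_remove(ngram, paste0("\\b", target, "\\b")))) |>
    group_by(word) |>
    summarise(n = sum(n)) |>
    arrange(desc(n))
}
words_around("pain")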
Which words are most used in each of the specialties? You can use `group_by()` and `slice_max()` (or `top_n()`) from `dplyr` to have the calculations done within each specialty. Remember to remove stop words. How about the 5 most used words?
mt_samples |>
unnest_tokens(word, transcription) |>
filter(
!word %in% stopwords("english"),
!grepl("\\d", word)
) |>
group_by(medical_specialty) |>
count(word) |>
slice_max(n, n = 1) |> # ties will appear
knitr::kable()
medical_specialty | word | n |
---|---|---|
Allergy / Immunology | allergies | 4 |
Allergy / Immunology | used | 4 |
Bariatrics | patient | 29 |
Cardiovascular / Pulmonary | normal | 25 |
Dentistry | removed | 10 |
Gastroenterology | patient | 835 |
General Medicine | patient | 1356 |
Hematology - Oncology | patient | 316 |
Hospice - Palliative Care | patient | 43 |
IME-QME-Work Comp etc. | pain | 152 |
Lab Medicine - Pathology | cm | 35 |
Lab Medicine - Pathology | tumor | 35 |
Letters | pain | 80 |
Nephrology | patient | 348 |
Neurology | right | 694 |
Neurosurgery | patient | 374 |
Obstetrics / Gynecology | patient | 628 |
Office Notes | normal | 230 |
Ophthalmology | eye | 456 |
Orthopedic | patient | 1711 |
Pain Management | patient | 236 |
Pediatrics - Neonatal | patient | 247 |
Physical Medicine - Rehab | patient | 220 |
Podiatry | foot | 232 |
Psychiatry / Psychology | patient | 532 |
Radiology | left | 701 |
Rheumatology | history | 50 |
SOAP / Chart / Progress Notes | patient | 537 |
Sleep Medicine | sleep | 143 |
Speech - Language | patient | 105 |
Surgery | patient | 4855 |
Urology | patient | 776 |
mt_samples |>
unnest_tokens(word, transcription) |>
filter(
!word %in% stopwords("english"),
!grepl("\\d", word)
) |>
group_by(medical_specialty) |>
count(word) |>
slice_max(n, n = 5) |> # ties will appear
knitr::kable()
medical_specialty | word | n |
---|---|---|
Allergy / Immunology | allergies | 4 |
Allergy / Immunology | used | 4 |
Allergy / Immunology | allegra | 3 |
Allergy / Immunology | clear | 3 |
Allergy / Immunology | prescription | 3 |
Allergy / Immunology | sprays | 3 |
Bariatrics | patient | 29 |
Bariatrics | history | 21 |
Bariatrics | placed | 21 |
Bariatrics | abdomen | 20 |
Bariatrics | procedure | 18 |
Cardiovascular / Pulmonary | normal | 25 |
Cardiovascular / Pulmonary | left | 18 |
Cardiovascular / Pulmonary | trachea | 17 |
Cardiovascular / Pulmonary | valve | 15 |
Cardiovascular / Pulmonary | aortic | 12 |
Dentistry | removed | 10 |
Dentistry | elevator | 8 |
Dentistry | tooth | 7 |
Dentistry | area | 4 |
Dentistry | aspect | 4 |
Dentistry | blade | 4 |
Dentistry | bone | 4 |
Dentistry | buccal | 4 |
Dentistry | bur | 4 |
Dentistry | closed | 4 |
Dentistry | envelope | 4 |
Dentistry | flap | 4 |
Dentistry | follicle | 4 |
Dentistry | gut | 4 |
Dentistry | incision | 4 |
Dentistry | irrigated | 4 |
Dentistry | made | 4 |
Dentistry | periosteal | 4 |
Dentistry | procedure | 4 |
Dentistry | raised | 4 |
Dentistry | remnants | 4 |
Gastroenterology | patient | 835 |
Gastroenterology | procedure | 455 |
Gastroenterology | history | 324 |
Gastroenterology | normal | 318 |
Gastroenterology | placed | 287 |
General Medicine | patient | 1356 |
General Medicine | history | 1027 |
General Medicine | normal | 717 |
General Medicine | pain | 567 |
General Medicine | mg | 503 |
Hematology - Oncology | patient | 316 |
Hematology - Oncology | history | 290 |
Hematology - Oncology | right | 196 |
Hematology - Oncology | left | 187 |
Hematology - Oncology | well | 120 |
Hospice - Palliative Care | patient | 43 |
Hospice - Palliative Care | mg | 28 |
Hospice - Palliative Care | history | 27 |
Hospice - Palliative Care | daughter | 22 |
Hospice - Palliative Care | family | 19 |
Hospice - Palliative Care | pain | 19 |
IME-QME-Work Comp etc. | pain | 152 |
IME-QME-Work Comp etc. | right | 131 |
IME-QME-Work Comp etc. | patient | 106 |
IME-QME-Work Comp etc. | back | 105 |
IME-QME-Work Comp etc. | dr | 82 |
Lab Medicine - Pathology | cm | 35 |
Lab Medicine - Pathology | tumor | 35 |
Lab Medicine - Pathology | x | 32 |
Lab Medicine - Pathology | right | 31 |
Lab Medicine - Pathology | lymph | 30 |
Letters | pain | 80 |
Letters | abc | 71 |
Letters | patient | 65 |
Letters | normal | 53 |
Letters | mr | 52 |
Nephrology | patient | 348 |
Nephrology | renal | 257 |
Nephrology | right | 243 |
Nephrology | history | 160 |
Nephrology | kidney | 144 |
Neurology | right | 694 |
Neurology | left | 672 |
Neurology | patient | 648 |
Neurology | normal | 485 |
Neurology | history | 429 |
Neurosurgery | patient | 374 |
Neurosurgery | placed | 282 |
Neurosurgery | right | 260 |
Neurosurgery | procedure | 247 |
Neurosurgery | left | 222 |
Obstetrics / Gynecology | patient | 628 |
Obstetrics / Gynecology | placed | 350 |
Obstetrics / Gynecology | uterus | 317 |
Obstetrics / Gynecology | procedure | 301 |
Obstetrics / Gynecology | incision | 293 |
Office Notes | normal | 230 |
Office Notes | negative | 193 |
Office Notes | without | 99 |
Office Notes | patient | 94 |
Office Notes | history | 76 |
Ophthalmology | eye | 456 |
Ophthalmology | patient | 258 |
Ophthalmology | right | 192 |
Ophthalmology | procedure | 176 |
Ophthalmology | placed | 168 |
Orthopedic | patient | 1711 |
Orthopedic | right | 1172 |
Orthopedic | left | 998 |
Orthopedic | placed | 924 |
Orthopedic | pain | 763 |
Pain Management | patient | 236 |
Pain Management | procedure | 197 |
Pain Management | needle | 156 |
Pain Management | placed | 99 |
Pain Management | injected | 76 |
Pain Management | pain | 76 |
Pediatrics - Neonatal | patient | 247 |
Pediatrics - Neonatal | history | 235 |
Pediatrics - Neonatal | well | 159 |
Pediatrics - Neonatal | normal | 155 |
Pediatrics - Neonatal | old | 93 |
Physical Medicine - Rehab | patient | 220 |
Physical Medicine - Rehab | right | 107 |
Physical Medicine - Rehab | left | 104 |
Physical Medicine - Rehab | pain | 95 |
Physical Medicine - Rehab | motor | 62 |
Podiatry | foot | 232 |
Podiatry | patient | 231 |
Podiatry | right | 156 |
Podiatry | left | 137 |
Podiatry | tendon | 98 |
Psychiatry / Psychology | patient | 532 |
Psychiatry / Psychology | history | 344 |
Psychiatry / Psychology | mg | 183 |
Psychiatry / Psychology | mother | 164 |
Psychiatry / Psychology | reported | 141 |
Radiology | left | 701 |
Radiology | normal | 644 |
Radiology | right | 644 |
Radiology | patient | 304 |
Radiology | exam | 302 |
Rheumatology | history | 50 |
Rheumatology | patient | 34 |
Rheumatology | mg | 26 |
Rheumatology | pain | 23 |
Rheumatology | day | 22 |
Rheumatology | examination | 22 |
Rheumatology | joints | 22 |
SOAP / Chart / Progress Notes | patient | 537 |
SOAP / Chart / Progress Notes | mg | 302 |
SOAP / Chart / Progress Notes | history | 254 |
SOAP / Chart / Progress Notes | pain | 239 |
SOAP / Chart / Progress Notes | well | 230 |
Sleep Medicine | sleep | 143 |
Sleep Medicine | patient | 69 |
Sleep Medicine | apnea | 35 |
Sleep Medicine | activity | 31 |
Sleep Medicine | stage | 29 |
Speech - Language | patient | 105 |
Speech - Language | therapy | 41 |
Speech - Language | speech | 35 |
Speech - Language | patient’s | 28 |
Speech - Language | able | 26 |
Surgery | patient | 4855 |
Surgery | left | 3263 |
Surgery | right | 3261 |
Surgery | procedure | 3243 |
Surgery | placed | 3025 |
Urology | patient | 776 |
Urology | bladder | 357 |
Urology | right | 307 |
Urology | procedure | 306 |
Urology | placed | 305 |
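If you'd rather not see ties (Dentistry alone contributes many tied rows above), `slice_max()` can break them arbitrarily via `with_ties = FALSE`; a variant of the same pipeline:

mt_samples |>
  unnest_tokens(word, transcription) |>
  filter(
    !word %in% stopwords("english"),
    !grepl("\\d", word)
  ) |>
  group_by(medical_specialty) |>
  count(word) |>
  slice_max(n, n = 5, with_ties = FALSE) |>
  knitr::kable()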
Find your own insight in the data. Ideas (each is worked through below):

- Which words are most distinctive for each specialty (e.g., via tf-idf)?
- How does sentiment vary across specialties?
- How long are the sentences in each specialty?
tf_idf_by_specialty <- mt_samples |>
unnest_tokens(word, transcription) |>
filter(
!word %in% stopwords("english"),
!grepl("[[:digit:]]+", word)
) |>
count(word, medical_specialty) |>
bind_tf_idf(word, medical_specialty, n)
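For reference, `bind_tf_idf()` computes, for word $w$ in specialty $d$ out of $N$ specialties,

$$
\text{tf-idf}(w, d) = \underbrace{\frac{n_{w,d}}{\sum_{w'} n_{w',d}}}_{\text{tf}} \times \underbrace{\ln\frac{N}{\left|\{d' : w \in d'\}\right|}}_{\text{idf}},
$$

so a word that appears in every specialty (like "patient") gets an idf of $\ln 1 = 0$ and drops out, which is exactly why tf-idf surfaces specialty-specific vocabulary.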
tf_idf_by_specialty |>
group_by(medical_specialty) |>
slice_max(tf_idf, n = 3) |>
filter(medical_specialty %in% c("Surgery", "Dentistry", "Allergy / Immunology")) |>
ggplot(aes(reorder(word, tf_idf), tf_idf)) +
theme_minimal() +
labs(y = NULL, x = NULL) +
geom_bar(stat = 'identity') +
coord_flip() +
facet_wrap(~ medical_specialty, scales = "free_y")
sentiment_list <- get_sentiments("bing")
sentiments_in_med <- tf_idf_by_specialty |>
left_join(sentiment_list, by = "word") |>
group_by(medical_specialty) |>
summarise(
n_positive = sum(if_else(sentiment == "positive", n, 0), na.rm = TRUE),
n_negative = sum(if_else(sentiment == "negative", n, 0), na.rm = TRUE),
n = sum(n)
)
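A quick sanity check before trusting the join: the Bing lexicon is a two-column tibble mapping each word to "positive" or "negative":

head(sentiment_list)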
sentiments_in_med |>
mutate(medical_specialty = reorder(
medical_specialty, (n_positive + n_negative) / n)) |>
ggplot(aes(medical_specialty)) +
theme_minimal() +
geom_col(aes(y = -n_negative / n), fill = "pink") +
geom_col(aes(y = n_positive / n), fill = "darkgreen") +
labs(x = NULL, y = NULL) +
coord_flip()
library(ggridges)
mt_samples |>
select(medical_specialty, transcription) |>
unnest_tokens(sentence, transcription, token = "sentences") |>
mutate(n_words = sapply(tokenizers::tokenize_words(sentence), length)) |>
ggplot() +
theme_minimal() +
geom_density_ridges(
aes(y = reorder(medical_specialty, n_words, max),
x = n_words, group = medical_specialty),
bandwidth = .5,
jittered_points = TRUE, point_alpha = .1,
position = position_raincloud(height = .3, ygap = 0)
) +
labs(x = "Number of words in a sentence.", y = NULL)
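A possible shortcut for the word-count step: the tokenizers package also provides `count_words()`, which should collapse the tokenize-then-`length()` pattern into one call (a sketch, same pipeline otherwise):

mt_samples |>
  select(medical_specialty, transcription) |>
  unnest_tokens(sentence, transcription, token = "sentences") |>
  mutate(n_words = tokenizers::count_words(sentence))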
# Runshi's version using `stringr`
# tokenize into sentences
tokens <- mt_samples |> select(transcription, medical_specialty) |>
unnest_tokens(sentence, transcription, token = "sentences")
# create a summary table of sentence length for each specialty
tokens |>
mutate(length = stringr::str_count(sentence, "\\w+")) |>
filter(length > 0 & !is.na(length)) |>
group_by(medical_specialty) |>
summarise(
average_length = mean(length),
max_length = max(length),
min_length = min(length),
n = n()
) |>
ggplot(aes(x = average_length, y = reorder(medical_specialty, average_length))) +
theme_minimal() +
geom_point() +
geom_segment(aes(xend = 0, yend = reorder(medical_specialty, average_length))) +
labs(y = NULL, x = "Mean number of words in a sentence")
# Deliverables