# Global knitr chunk options (defaults; nothing is overridden here).
knitr::opts_chunk$set()

Learning goals

Lab description

For this lab we will be working with the medical record transcriptions from https://www.mtsamples.com/. The data have been loaded and “fairly” cleaned, and are available at https://github.com/JSC370/jsc370-2023/blob/main/data/medical_transcriptions/.

Setup packages

You should load in dplyr, (or data.table if you want to work that way), ggplot2 and tidytext. If you don’t already have tidytext then you can install with

# One-time setup: run these in the console, not in the rendered document.
install.packages("tidytext")
install.packages("stopwords")

Read in Medical Transcriptions

Loading in reference transcription samples from https://www.mtsamples.com/

# Text-mining and data-manipulation packages used throughout the lab.
library(tidytext)
library(readr)
library(dplyr)
library(tidyr)
library(ggplot2)

# Pull the pre-cleaned MT Samples transcription data from GitHub and keep
# only the three columns the lab works with.
data_url <- paste0(
  "https://raw.githubusercontent.com/JSC370/",
  "jsc370-2023/main/data/medical_transcriptions/mtsamples.csv"
)
mt_samples <- read_csv(data_url) |>
  select(description, medical_specialty, transcription)
head(mt_samples)
## # A tibble: 6 × 3
##   description                                                    medic…¹ trans…²
##   <chr>                                                          <chr>   <chr>  
## 1 A 23-year-old white female presents with complaint of allergi… Allerg… "SUBJE…
## 2 Consult for laparoscopic gastric bypass.                       Bariat… "PAST …
## 3 Consult for laparoscopic gastric bypass.                       Bariat… "HISTO…
## 4 2-D M-Mode. Doppler.                                           Cardio… "2-D M…
## 5 2-D Echocardiogram                                             Cardio… "1.  T…
## 6 Morbid obesity.  Laparoscopic antecolic antegastric Roux-en-Y… Bariat… "PREOP…
## # … with abbreviated variable names ¹​medical_specialty, ²​transcription

Question 1: What specialties do we have?

We can use count() from dplyr to figure out how many different categories we have. Are these categories related? Overlapping? Evenly distributed?

# Q1: number of transcriptions per medical specialty.
specialty_counts <- mt_samples |>
  count(medical_specialty, sort = TRUE) |>
  # Reorder the factor by frequency so the flipped bars are sorted.
  mutate(medical_specialty = forcats::fct_reorder(medical_specialty, n))

specialty_counts |>
  ggplot(aes(medical_specialty, n)) +
  theme_minimal() +
  labs(y = NULL, x = NULL) +
  geom_col() +
  coord_flip()


Question 2

  • Tokenize the words in the transcription column
  • Count the number of times each token appears
  • Visualize the top 20 most frequent words

Explain what we see from this result. Does it make sense? What insights (if any) do we get?

# Q2: tokenize the transcriptions into single words and count each token.
tokens <- mt_samples |>
  select(transcription) |>
  unnest_tokens(word, transcription) |>
  count(word) # shorthand for group_by(word) |> summarise(n = n())

# Top 20 most frequent tokens; slice_max() keeps ties, so more than 20
# rows may be plotted.
tokens |>
  slice_max(n, n = 20) |>
  ggplot(aes(reorder(word, n), n)) +
  theme_minimal() +
  labs(y = NULL, x = NULL) +
  geom_col() +
  coord_flip()


Question 3

  • Redo visualization for the top 20 most frequent words after removing stop words
  • Bonus points if you remove numbers as well

What do we see now that we have removed stop words? Does it give us a better idea of what the text is about?

# Inspect the English stop word lexicon from the {stopwords} package.
library(stopwords)
head(stopwords("english"))
## [1] "i"      "me"     "my"     "myself" "we"     "our"
length(stopwords("english"))
## [1] 175
# Drop English stop words, then drop any token containing a digit.
tokens_no_stopwords <- tokens |>
  filter(!(word %in% stopwords("english"))) |>
  filter(!grepl("[[:digit:]]+", word))

# Top 20 tokens once stop words and numbers are removed (ties kept).
tokens_no_stopwords |>
  slice_max(n, n = 20) |>
  ggplot(aes(reorder(word, n), n)) +
  theme_minimal() +
  labs(y = NULL, x = NULL) +
  geom_col() +
  coord_flip()

Another method for visualizing word counts is using a word cloud via wordcloud::wordcloud(). Create a word cloud for the top 50 most frequent words after removing stop words (and numbers).

library(wordcloud)
## Loading required package: RColorBrewer
# Word cloud of the 50 most frequent tokens (stop words/numbers removed).
top50 <- tokens_no_stopwords |>
  slice_max(n, n = 50)
wordcloud(top50$word, top50$n,
          colors = brewer.pal(8, "Set2"))


Question 4

Repeat question 3, but this time tokenize into bi-grams. How does the result change if you look at tri-grams? (You don’t need to create the word clouds.)

# Regex matching an n-gram that *starts* with any English stop word.
sw_start <- paste0("^", paste(stopwords("english"), collapse = " |^"), " ")
# Regex matching an n-gram that *ends* with any English stop word.
sw_end <- paste0(" ", paste(stopwords("english"), collapse = "$| "), "$")
# 2-grams: tokenize, drop unwanted n-grams, count.
tokens_bigram <- mt_samples |>
  select(transcription) |>
  unnest_tokens(ngram, transcription, token = "ngrams", n = 2) |>
  filter(
    # unnest_tokens() yields NA for texts shorter than n words (or NA
    # transcriptions); grepl(NA) is FALSE, so without this check the NA
    # rows slip through the negated filters and end up in the counts.
    !is.na(ngram),
    # remove those starting/ending with a stop word
    !grepl(sw_start, ngram, ignore.case = TRUE),
    !grepl(sw_end, ngram, ignore.case = TRUE),
    !grepl("\\d", ngram) # remove numbers
  ) |>
  count(ngram)
# 3-grams, base-R grepl() version (rebuilt with stringr just below).
tokens_trigram <- mt_samples |>
  select(transcription) |>
  unnest_tokens(ngram, transcription, token = "ngrams", n = 3) |>
  filter(
    # NA n-grams (too-short or missing texts) would survive the negated
    # grepl() filters because grepl(NA) is FALSE — drop them explicitly.
    !is.na(ngram),
    # remove those with stop words
    !grepl(sw_start, ngram, ignore.case = TRUE),
    !grepl(sw_end, ngram, ignore.case = TRUE),
    # NOTE(review): this only drops n-grams that start or end with a
    # digit, while the stringr version below (which overwrites this
    # object) drops any n-gram containing a digit — confirm which
    # behavior is intended.
    !grepl("^\\d|\\d$", ngram)
  ) |>
  count(ngram)
library(stringr) # if you are more comfortable with str_*()
# Same 3-gram pipeline using stringr; this overwrites tokens_trigram.
tokens_trigram <- mt_samples |>
  select(transcription) |>
  unnest_tokens(ngram, transcription, token = "ngrams", n = 3) |>
  filter(
    # str_detect(NA) returns NA, which filter() drops anyway; keeping
    # the check explicit makes the intent clear and matches the
    # grepl()-based pipelines above.
    !is.na(ngram),
    !str_detect(ngram, regex(sw_start, ignore_case = TRUE)),
    !str_detect(ngram, regex(sw_end, ignore_case = TRUE)),
    !str_detect(ngram, "\\d") # drop any n-gram containing a digit
  ) |>
  count(ngram)
# Top 20 bigrams (ties kept by slice_max()).
tokens_bigram |>
  slice_max(n, n = 20) |>
  ggplot(aes(reorder(ngram, n), n)) +
  theme_minimal() +
  labs(y = NULL, x = NULL) +
  geom_col() +
  coord_flip()
# Top 20 trigrams.
tokens_trigram |>
  slice_max(n, n = 20) |>
  ggplot(aes(reorder(ngram, n), n)) +
  theme_minimal() +
  labs(y = NULL, x = NULL) +
  geom_col() +
  coord_flip()

The n-gram word clouds aren’t as helpful as the word cloud built from single words.

# Word clouds for the 10 most frequent bi- and tri-grams.
# (head(10) rather than slice_max() so ties do not add extra terms.)
tokens10_bigram <- tokens_bigram |>
  arrange(desc(n)) |>
  head(10)
tokens10_trigram <- tokens_trigram |>
  arrange(desc(n)) |>
  head(10)
wordcloud(tokens10_bigram$ngram, tokens10_bigram$n)
wordcloud(tokens10_trigram$ngram, tokens10_trigram$n)
## Warning in wordcloud(tokens10_trigram$ngram, tokens10_trigram$n): prepped and
## draped could not be fit on page. It will not be plotted.
## Warning in wordcloud(tokens10_trigram$ngram, tokens10_trigram$n): tolerated the
## procedure could not be fit on page. It will not be plotted.


Question 5

Using the results you got from question 4, pick a word and count the words that appear before or after it.

library(stringr)
# e.g., words that come immediately before or after "patient"
tokens_bigram |>
  filter(str_detect(ngram, regex("\\spatient$|^patient\\s"))) |>
  # Strip "patient" together with its adjacent space. Anchoring the
  # pattern to the start/end of the bigram avoids mangling bigrams such
  # as "patients patient", where the original str_remove(ngram, "patient")
  # hit the *first* substring match (inside "patients") and produced the
  # bogus token "spatient".
  mutate(word = str_remove(ngram, "^patient\\s|\\spatient$")) |>
  group_by(word) |>
  # A word can occur both before and after "patient"; sum the two counts.
  summarise(n = sum(n)) |>
  slice_max(n, n = 50) |>
  ggplot(aes(reorder(word, n), n)) +
  theme_minimal() +
  labs(y = NULL, x = NULL) +
  geom_col() +
  coord_flip()

# e.g., pairs of words that come before and after "patient"
tokens_trigram |>
  filter(str_detect(ngram, regex("\\spatient\\s"))) |>
  # Replace the space-delimited middle word so the first substring match
  # cannot land inside a longer word: with the original
  # str_replace(ngram, "patient", "---") a trigram like
  # "outpatient saw patient" would have become "out--- saw patient".
  mutate(word = str_replace(ngram, "\\spatient\\s", " --- ")) |>
  group_by(word) |>
  summarise(n = sum(n)) |>
  slice_max(n, n = 50) |>
  ggplot(aes(reorder(word, n), n)) +
  theme_minimal() +
  labs(y = NULL, x = NULL) +
  geom_col() +
  coord_flip()


Question 6

Which words are most used in each of the specialties? You can use group_by() and top_n() or slice_max() from dplyr to have the calculations be done within each specialty. Remember to remove stop words. How about the 5 most used words?

# Most frequent non-stop-word token within each specialty.
mt_samples |>
  unnest_tokens(word, transcription) |>
  filter(
    !word %in% stopwords("english"),
    !grepl("\\d", word)
  ) |>
  count(medical_specialty, word) |>
  group_by(medical_specialty) |>
  slice_max(n, n = 1) |> # ties are kept, so some specialties repeat
  knitr::kable()
medical_specialty word n
Allergy / Immunology allergies 4
Allergy / Immunology used 4
Bariatrics patient 29
Cardiovascular / Pulmonary normal 25
Dentistry removed 10
Gastroenterology patient 835
General Medicine patient 1356
Hematology - Oncology patient 316
Hospice - Palliative Care patient 43
IME-QME-Work Comp etc. pain 152
Lab Medicine - Pathology cm 35
Lab Medicine - Pathology tumor 35
Letters pain 80
Nephrology patient 348
Neurology right 694
Neurosurgery patient 374
Obstetrics / Gynecology patient 628
Office Notes normal 230
Ophthalmology eye 456
Orthopedic patient 1711
Pain Management patient 236
Pediatrics - Neonatal patient 247
Physical Medicine - Rehab patient 220
Podiatry foot 232
Psychiatry / Psychology patient 532
Radiology left 701
Rheumatology history 50
SOAP / Chart / Progress Notes patient 537
Sleep Medicine sleep 143
Speech - Language patient 105
Surgery patient 4855
Urology patient 776
# Five most frequent non-stop-word tokens within each specialty.
mt_samples |>
  unnest_tokens(word, transcription) |>
  filter(
    !word %in% stopwords("english"),
    !grepl("\\d", word)
  ) |>
  count(medical_specialty, word) |>
  group_by(medical_specialty) |>
  slice_max(n, n = 5) |> # ties are kept, so groups may exceed 5 rows
  knitr::kable()
medical_specialty word n
Allergy / Immunology allergies 4
Allergy / Immunology used 4
Allergy / Immunology allegra 3
Allergy / Immunology clear 3
Allergy / Immunology prescription 3
Allergy / Immunology sprays 3
Bariatrics patient 29
Bariatrics history 21
Bariatrics placed 21
Bariatrics abdomen 20
Bariatrics procedure 18
Cardiovascular / Pulmonary normal 25
Cardiovascular / Pulmonary left 18
Cardiovascular / Pulmonary trachea 17
Cardiovascular / Pulmonary valve 15
Cardiovascular / Pulmonary aortic 12
Dentistry removed 10
Dentistry elevator 8
Dentistry tooth 7
Dentistry area 4
Dentistry aspect 4
Dentistry blade 4
Dentistry bone 4
Dentistry buccal 4
Dentistry bur 4
Dentistry closed 4
Dentistry envelope 4
Dentistry flap 4
Dentistry follicle 4
Dentistry gut 4
Dentistry incision 4
Dentistry irrigated 4
Dentistry made 4
Dentistry periosteal 4
Dentistry procedure 4
Dentistry raised 4
Dentistry remnants 4
Gastroenterology patient 835
Gastroenterology procedure 455
Gastroenterology history 324
Gastroenterology normal 318
Gastroenterology placed 287
General Medicine patient 1356
General Medicine history 1027
General Medicine normal 717
General Medicine pain 567
General Medicine mg 503
Hematology - Oncology patient 316
Hematology - Oncology history 290
Hematology - Oncology right 196
Hematology - Oncology left 187
Hematology - Oncology well 120
Hospice - Palliative Care patient 43
Hospice - Palliative Care mg 28
Hospice - Palliative Care history 27
Hospice - Palliative Care daughter 22
Hospice - Palliative Care family 19
Hospice - Palliative Care pain 19
IME-QME-Work Comp etc. pain 152
IME-QME-Work Comp etc. right 131
IME-QME-Work Comp etc. patient 106
IME-QME-Work Comp etc. back 105
IME-QME-Work Comp etc. dr 82
Lab Medicine - Pathology cm 35
Lab Medicine - Pathology tumor 35
Lab Medicine - Pathology x 32
Lab Medicine - Pathology right 31
Lab Medicine - Pathology lymph 30
Letters pain 80
Letters abc 71
Letters patient 65
Letters normal 53
Letters mr 52
Nephrology patient 348
Nephrology renal 257
Nephrology right 243
Nephrology history 160
Nephrology kidney 144
Neurology right 694
Neurology left 672
Neurology patient 648
Neurology normal 485
Neurology history 429
Neurosurgery patient 374
Neurosurgery placed 282
Neurosurgery right 260
Neurosurgery procedure 247
Neurosurgery left 222
Obstetrics / Gynecology patient 628
Obstetrics / Gynecology placed 350
Obstetrics / Gynecology uterus 317
Obstetrics / Gynecology procedure 301
Obstetrics / Gynecology incision 293
Office Notes normal 230
Office Notes negative 193
Office Notes without 99
Office Notes patient 94
Office Notes history 76
Ophthalmology eye 456
Ophthalmology patient 258
Ophthalmology right 192
Ophthalmology procedure 176
Ophthalmology placed 168
Orthopedic patient 1711
Orthopedic right 1172
Orthopedic left 998
Orthopedic placed 924
Orthopedic pain 763
Pain Management patient 236
Pain Management procedure 197
Pain Management needle 156
Pain Management placed 99
Pain Management injected 76
Pain Management pain 76
Pediatrics - Neonatal patient 247
Pediatrics - Neonatal history 235
Pediatrics - Neonatal well 159
Pediatrics - Neonatal normal 155
Pediatrics - Neonatal old 93
Physical Medicine - Rehab patient 220
Physical Medicine - Rehab right 107
Physical Medicine - Rehab left 104
Physical Medicine - Rehab pain 95
Physical Medicine - Rehab motor 62
Podiatry foot 232
Podiatry patient 231
Podiatry right 156
Podiatry left 137
Podiatry tendon 98
Psychiatry / Psychology patient 532
Psychiatry / Psychology history 344
Psychiatry / Psychology mg 183
Psychiatry / Psychology mother 164
Psychiatry / Psychology reported 141
Radiology left 701
Radiology normal 644
Radiology right 644
Radiology patient 304
Radiology exam 302
Rheumatology history 50
Rheumatology patient 34
Rheumatology mg 26
Rheumatology pain 23
Rheumatology day 22
Rheumatology examination 22
Rheumatology joints 22
SOAP / Chart / Progress Notes patient 537
SOAP / Chart / Progress Notes mg 302
SOAP / Chart / Progress Notes history 254
SOAP / Chart / Progress Notes pain 239
SOAP / Chart / Progress Notes well 230
Sleep Medicine sleep 143
Sleep Medicine patient 69
Sleep Medicine apnea 35
Sleep Medicine activity 31
Sleep Medicine stage 29
Speech - Language patient 105
Speech - Language therapy 41
Speech - Language speech 35
Speech - Language patient’s 28
Speech - Language able 26
Surgery patient 4855
Surgery left 3263
Surgery right 3261
Surgery procedure 3243
Surgery placed 3025
Urology patient 776
Urology bladder 357
Urology right 307
Urology procedure 306
Urology placed 305

Question 7 - extra

Find your own insight in the data:

Ideas:

# Words most characteristic of each specialty, measured by tf-idf.
tf_idf_by_specialty <- mt_samples |>
  unnest_tokens(word, transcription) |>
  filter(
    !word %in% stopwords("english"),
    !grepl("[[:digit:]]+", word)
  ) |>
  count(word, medical_specialty) |>
  bind_tf_idf(word, medical_specialty, n)

# Three highest-tf-idf words for a few contrasting specialties.
tf_idf_by_specialty |>
  group_by(medical_specialty) |>
  slice_max(tf_idf, n = 3) |>
  filter(medical_specialty %in% c("Surgery", "Dentistry", "Allergy / Immunology")) |>
  ggplot(aes(reorder(word, tf_idf), tf_idf)) +
  theme_minimal() +
  labs(y = NULL, x = NULL) +
  geom_col() +
  coord_flip() +
  facet_wrap(~ medical_specialty, scales = "free_y")

# Bing lexicon: one row per (word, sentiment) pair.
sentiment_list <- get_sentiments("bing")
# Per specialty, tally how many token occurrences carry positive vs
# negative sentiment; `n` is the total token count for the specialty.
# NOTE(review): if any word appears more than once in the lexicon, the
# left_join() fans out that word's rows and inflates the totals — worth
# confirming with anyDuplicated(sentiment_list$word).
sentiments_in_med <- tf_idf_by_specialty |>
  left_join(sentiment_list, by = "word") |>
  group_by(medical_specialty) |>
  summarise(
    # Words absent from the lexicon get sentiment = NA; na.rm drops them.
    n_positive = sum(if_else(sentiment == "positive", n, 0), na.rm = TRUE),
    n_negative = sum(if_else(sentiment == "negative", n, 0), na.rm = TRUE) ,
    n = sum(n)
  )
# Diverging bars: negative share plotted to the left (pink) and positive
# share to the right (green), ordered by overall sentiment-word share.
p_sentiment <- sentiments_in_med |>
  mutate(medical_specialty = reorder(
    medical_specialty, (n_positive + n_negative) / n)) |>
  ggplot(aes(medical_specialty)) +
  theme_minimal() +
  geom_col(aes(y = -n_negative / n), fill = "pink") +
  geom_col(aes(y = n_positive / n), fill = "darkgreen") +
  labs(x = NULL, y = NULL) +
  coord_flip()
p_sentiment

library(ggridges)
# Distribution of sentence lengths (words per sentence) per specialty,
# drawn as raincloud-style density ridges.
mt_samples |>
  select(medical_specialty, transcription) |>
  unnest_tokens(sentence, transcription, token = "sentences") |>
  # lengths() replaces sapply(..., length): identical result, but
  # type-stable and faster (sapply's return type varies with its input).
  mutate(n_words = lengths(tokenizers::tokenize_words(sentence))) |>
  ggplot() +
  theme_minimal() +
  geom_density_ridges(
    aes(y = reorder(medical_specialty, n_words, max),
        x = n_words, group = medical_specialty),
    bandwidth = .5,
    jittered_points = TRUE, point_alpha = .1,
    position = position_raincloud(height = .3, ygap = 0)
  ) +
  labs(x = "Number of words in a sentence.", y = NULL)

# Runshi's version using `stringr`.
# Tokenize into sentences. Stored under its own name so it does not
# clobber the word-level `tokens` table built in Question 2 (the
# original reused the name `tokens` here, silently overwriting it).
sentence_tokens <- mt_samples |>
  select(transcription, medical_specialty) |>
  unnest_tokens(sentence, transcription, token = "sentences")

# Summarise sentence length (word count) per specialty and plot the
# mean as a lollipop chart.
sentence_tokens |>
  mutate(length = stringr::str_count(sentence, "\\w+")) |>
  filter(length > 0 & !is.na(length)) |> # drop empty/NA sentences
  group_by(medical_specialty) |>
  summarise(
    average_length = mean(length),
    max_length = max(length),
    min_length = min(length),
    n = n()
  ) |>
  ggplot(aes(x = average_length, y = reorder(medical_specialty, average_length))) +
  theme_minimal() +
  geom_point() +
  geom_segment(aes(xend = 0, yend = reorder(medical_specialty, average_length))) +
  labs(y = NULL, x = "Mean number of words in a sentence")

# Deliverables

  1. Questions 1-7 answered, pdf or html output uploaded to Quercus