In this lab we use `unnest_tokens()` and `unnest_ngrams()` to extract tokens and n-grams from text. We will be working with the medical record transcriptions from https://www.mtsamples.com/, which are loaded and "fairly" cleaned at https://github.com/JSC370/jsc370-2023/blob/main/data/medical_transcriptions/.
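To see what these two functions do before touching the real data, here is a minimal sketch on a made-up sentence (the tibble and its contents are illustrative only):

library(tidytext)
example <- dplyr::tibble(text = "the patient was seen in the clinic today")
unnest_tokens(example, word, text)           # one row per word
unnest_ngrams(example, ngram, text, n = 2)   # one row per bigram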
You should load in `dplyr` (or `data.table` if you want to work that way), `ggplot2`, and `tidytext`. If you don't already have `tidytext` and `stopwords`, you can install them with:
install.packages("tidytext")
install.packages("stopwords")
Loading in reference transcription samples from https://www.mtsamples.com/
library(tidytext)
library(readr)
library(dplyr)
library(tidyr)
library(ggplot2)
data_url <- paste0(
"https://raw.githubusercontent.com/JSC370/",
"jsc370-2023/main/data/medical_transcriptions/mtsamples.csv"
)
mt_samples <- read_csv(data_url)
mt_samples <- mt_samples |>
select(description, medical_specialty, transcription)
head(mt_samples)
## # A tibble: 6 × 3
## description medic…¹ trans…²
## <chr> <chr> <chr>
## 1 A 23-year-old white female presents with complaint of allergi… Allerg… "SUBJE…
## 2 Consult for laparoscopic gastric bypass. Bariat… "PAST …
## 3 Consult for laparoscopic gastric bypass. Bariat… "HISTO…
## 4 2-D M-Mode. Doppler. Cardio… "2-D M…
## 5 2-D Echocardiogram Cardio… "1. T…
## 6 Morbid obesity. Laparoscopic antecolic antegastric Roux-en-Y… Bariat… "PREOP…
## # … with abbreviated variable names ¹medical_specialty, ²transcription
We can use `count()` from `dplyr` to figure out how many different categories we have. Are these categories related? Overlapping? Evenly distributed?
mt_samples |>
count(medical_specialty, sort = TRUE) |>
mutate(medical_specialty = forcats::fct_reorder(medical_specialty, n)) |>
ggplot(aes(medical_specialty, n)) +
theme_minimal() +
labs(y = NULL, x = NULL) +
geom_col() +
coord_flip()
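The "evenly distributed?" part can also be checked numerically; a quick sketch (our addition, not part of the lab's code) adds a proportion column to the same count:

mt_samples |>
  count(medical_specialty, sort = TRUE) |>
  mutate(prop = n / sum(n)) |>
  head()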
Tokenize the words in the `transcription` column and count the number of times each token appears. Visualize the top 20 most frequent words. Explain what we see from this result. Does it make sense? What insights (if any) do we get?
tokens <- mt_samples |>
select(transcription) |>
unnest_tokens(word, transcription) |>
# group_by(word) |>
# summarise(n = n()) # same as `count(word)`
count(word)
tokens |>
# arrange(across(n, desc)) |>
# head(20) |> # same as `slice_max(n, n = 20)`
slice_max(n, n = 20) |>
ggplot(aes(reorder(word, n), n)) +
theme_minimal() +
labs(y = NULL, x = NULL) +
geom_bar(stat = "identity") +
coord_flip()
What do we see now that we have removed stop words? Does it give us a better idea of what the text is about?
library(stopwords)
head(stopwords("english"))
## [1] "i" "me" "my" "myself" "we" "our"
length(stopwords("english"))
## [1] 175
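An aside: by default `stopwords()` uses the Snowball lexicon. The `stopwords` package also ships other sources via its `source` argument, e.g. the longer SMART list (shown here as an option, not a requirement of the lab):

head(stopwords("en", source = "smart"))
length(stopwords("en", source = "smart"))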
tokens_no_stopwords <- tokens |>
filter(
!(word %in% stopwords("english")),
!grepl("[[:digit:]]+", word)
)
tokens_no_stopwords |>
slice_max(n, n = 20) |>
ggplot(aes(reorder(word, n), n)) +
theme_minimal() +
labs(y = NULL, x = NULL) +
geom_bar(stat = "identity") +
coord_flip()
Another method for visualizing word counts is a word cloud via `wordcloud::wordcloud()`. Create a word cloud for the top 50 most frequent words after removing stop words (and numbers).
library(wordcloud)
## Loading required package: RColorBrewer
tokens50 <- tokens_no_stopwords |>
slice_max(n, n = 50)
wordcloud(tokens50$word, tokens50$n,
colors = brewer.pal(8, "Set2"))
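An alternative sketch: rather than pre-slicing the top 50 ourselves, `wordcloud()` can cap the number of words via its `max.words` argument (least frequent terms are dropped):

wordcloud(tokens_no_stopwords$word, tokens_no_stopwords$n,
          max.words = 50, colors = brewer.pal(8, "Set2"))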
Repeat question 3, but this time tokenize into bi-grams. How does the result change if you look at tri-grams? (You don’t need to create the word clouds.)
# start with any of the stop words
sw_start <- paste0("^", paste(stopwords("english"), collapse = " |^"), " ")
# end with any of the stop words
sw_end <- paste0(" ", paste(stopwords("english"), collapse = "$| "), "$")
# 2-gram
tokens_bigram <- mt_samples |>
select(transcription) |>
unnest_tokens(ngram, transcription, token = "ngrams", n = 2) |>
filter(
# remove those with stop words
!grepl(sw_start, ngram, ignore.case = TRUE),
!grepl(sw_end, ngram, ignore.case = TRUE),
!grepl("\\d", ngram) # remove numbers
) |>
count(ngram)
# 3-gram
tokens_trigram <- mt_samples |>
select(transcription) |>
unnest_tokens(ngram, transcription, token = "ngrams", n = 3) |>
filter(
# remove those with stop words
!grepl(sw_start, ngram, ignore.case = TRUE),
!grepl(sw_end, ngram, ignore.case = TRUE),
!grepl("^\\d|\\d$", ngram) # remove if end or start with a number
) |>
count(ngram)
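If the regular expressions above look opaque, printing a prefix of each shows the structure: every alternative is an anchored stop word with an adjacent space (e.g. `sw_start` begins `"^i |^me |^my |..."`):

substr(sw_start, 1, 30)
substr(sw_end, 1, 30)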
library(stringr) # if you are more comfortable with str_*()
tokens_trigram <- mt_samples |>
select(transcription) |>
unnest_tokens(ngram, transcription, token = "ngrams", n = 3) |>
filter(
!str_detect(ngram, regex(sw_start, ignore_case = TRUE)),
!str_detect(ngram, regex(sw_end, ignore_case = TRUE)),
!str_detect(ngram, "\\d")
) |>
count(ngram)
tokens_bigram |>
slice_max(n, n = 20) |>
ggplot(aes(reorder(ngram, n), n)) +
theme_minimal() +
labs(y = NULL, x = NULL) +
geom_bar(stat = 'identity') +
coord_flip()
tokens_trigram |>
slice_max(n, n = 20) |>
ggplot(aes(reorder(ngram, n), n)) +
theme_minimal() +
labs(y = NULL, x = NULL) +
geom_bar(stat = 'identity') +
coord_flip()
The n-gram word clouds aren't as helpful as the word cloud built from single words.
tokens10_bigram <- tokens_bigram |>
arrange(across(n, desc)) |>
head(10)
tokens10_trigram <- tokens_trigram |>
arrange(across(n, desc)) |>
head(10)
wordcloud(tokens10_bigram$ngram, tokens10_bigram$n)
wordcloud(tokens10_trigram$ngram, tokens10_trigram$n)
## Warning in wordcloud(tokens10_trigram$ngram, tokens10_trigram$n): prepped and
## draped could not be fit on page. It will not be plotted.
## Warning in wordcloud(tokens10_trigram$ngram, tokens10_trigram$n): tolerated the
## procedure could not be fit on page. It will not be plotted.
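The "could not be fit on page" warnings mean some trigrams are too long at the default font sizes; one workaround (a judgment call, not part of the lab) is to shrink `wordcloud()`'s `scale` argument, which controls the range of word sizes:

wordcloud(tokens10_trigram$ngram, tokens10_trigram$n,
          scale = c(2, 0.5))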
Using the results you got from question 4, pick a word and count the words that appear before or after it.
library(stringr)
# e.g., words that come before or after patient
tokens_bigram |>
filter(str_detect(ngram, regex("\\spatient$|^patient\\s"))) |>
mutate(word = str_remove(ngram, "patient"),
word = str_remove_all(word, " ")) |>
group_by(word) |>
summarise(n = sum(n)) |>
slice_max(n, n = 50) |>
ggplot(aes(reorder(word, n), n)) +
theme_minimal() +
labs(y = NULL, x = NULL) +
geom_bar(stat = "identity") +
coord_flip()
# e.g., pairs of words that come before and after patient
tokens_trigram |>
filter(str_detect(ngram, regex("\\spatient\\s"))) |>
mutate(word = str_replace(ngram, "patient", "---")) |>
group_by(word) |>
summarise(n = sum(n)) |>
slice_max(n, n = 50) |>
ggplot(aes(reorder(word, n), n)) +
theme_minimal() +
labs(y = NULL, x = NULL) +
geom_bar(stat = "identity") +
coord_flip()
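To repeat this for other words, it may help to wrap the bigram step in a small helper. The function below is illustrative only (the name `words_around()` is ours, not the lab's), assuming `tokens_bigram` and `stringr` from above:

# count the words that appear immediately before or after `target`
words_around <- function(target) {
  pattern <- paste0("\\s", target, "$|^", target, "\\s")
  tokens_bigram |>
    filter(str_detect(ngram, regex(pattern))) |>
    mutate(word = str_squish(str_remove(ngram, paste0("\\b", target, "\\b")))) |>
    group_by(word) |>
    summarise(n = sum(n)) |>
    arrange(desc(n))
}
words_around("pain")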
Which words are most used in each of the specialties? You can use `group_by()` and `slice_max()` (or `top_n()`) from `dplyr` to have the calculations done within each specialty. Remember to remove stop words. How about the 5 most used words?
mt_samples |>
unnest_tokens(word, transcription) |>
filter(
!word %in% stopwords("english"),
!grepl("\\d", word)
) |>
group_by(medical_specialty) |>
count(word) |>
slice_max(n, n = 1) |> # ties will appear
knitr::kable()
medical_specialty | word | n |
---|---|---|
Allergy / Immunology | allergies | 4 |
Allergy / Immunology | used | 4 |
Bariatrics | patient | 29 |
Cardiovascular / Pulmonary | normal | 25 |
Dentistry | removed | 10 |
Gastroenterology | patient | 835 |
General Medicine | patient | 1356 |
Hematology - Oncology | patient | 316 |
Hospice - Palliative Care | patient | 43 |
IME-QME-Work Comp etc. | pain | 152 |
Lab Medicine - Pathology | cm | 35 |
Lab Medicine - Pathology | tumor | 35 |
Letters | pain | 80 |
Nephrology | patient | 348 |
Neurology | right | 694 |
Neurosurgery | patient | 374 |
Obstetrics / Gynecology | patient | 628 |
Office Notes | normal | 230 |
Ophthalmology | eye | 456 |
Orthopedic | patient | 1711 |
Pain Management | patient | 236 |
Pediatrics - Neonatal | patient | 247 |
Physical Medicine - Rehab | patient | 220 |
Podiatry | foot | 232 |
Psychiatry / Psychology | patient | 532 |
Radiology | left | 701 |
Rheumatology | history | 50 |
SOAP / Chart / Progress Notes | patient | 537 |
Sleep Medicine | sleep | 143 |
Speech - Language | patient | 105 |
Surgery | patient | 4855 |
Urology | patient | 776 |
mt_samples |>
unnest_tokens(word, transcription) |>
filter(
!word %in% stopwords("english"),
!grepl("\\d", word)
) |>
group_by(medical_specialty) |>
count(word) |>
slice_max(n, n = 5) |> # ties will appear
knitr::kable()
medical_specialty | word | n |
---|---|---|
Allergy / Immunology | allergies | 4 |
Allergy / Immunology | used | 4 |
Allergy / Immunology | allegra | 3 |
Allergy / Immunology | clear | 3 |
Allergy / Immunology | prescription | 3 |
Allergy / Immunology | sprays | 3 |
Bariatrics | patient | 29 |
Bariatrics | history | 21 |
Bariatrics | placed | 21 |
Bariatrics | abdomen | 20 |
Bariatrics | procedure | 18 |
Cardiovascular / Pulmonary | normal | 25 |
Cardiovascular / Pulmonary | left | 18 |
Cardiovascular / Pulmonary | trachea | 17 |
Cardiovascular / Pulmonary | valve | 15 |
Cardiovascular / Pulmonary | aortic | 12 |
Dentistry | removed | 10 |
Dentistry | elevator | 8 |
Dentistry | tooth | 7 |
Dentistry | area | 4 |
Dentistry | aspect | 4 |
Dentistry | blade | 4 |
Dentistry | bone | 4 |
Dentistry | buccal | 4 |
Dentistry | bur | 4 |
Dentistry | closed | 4 |
Dentistry | envelope | 4 |
Dentistry | flap | 4 |
Dentistry | follicle | 4 |
Dentistry | gut | 4 |
Dentistry | incision | 4 |
Dentistry | irrigated | 4 |
Dentistry | made | 4 |
Dentistry | periosteal | 4 |
Dentistry | procedure | 4 |
Dentistry | raised | 4 |
Dentistry | remnants | 4 |
Gastroenterology | patient | 835 |
Gastroenterology | procedure | 455 |
Gastroenterology | history | 324 |
Gastroenterology | normal | 318 |
Gastroenterology | placed | 287 |
General Medicine | patient | 1356 |
General Medicine | history | 1027 |
General Medicine | normal | 717 |
General Medicine | pain | 567 |
General Medicine | mg | 503 |
Hematology - Oncology | patient | 316 |
Hematology - Oncology | history | 290 |
Hematology - Oncology | right | 196 |
Hematology - Oncology | left | 187 |
Hematology - Oncology | well | 120 |
Hospice - Palliative Care | patient | 43 |
Hospice - Palliative Care | mg | 28 |
Hospice - Palliative Care | history | 27 |
Hospice - Palliative Care | daughter | 22 |
Hospice - Palliative Care | family | 19 |
Hospice - Palliative Care | pain | 19 |
IME-QME-Work Comp etc. | pain | 152 |
IME-QME-Work Comp etc. | right | 131 |
IME-QME-Work Comp etc. | patient | 106 |
IME-QME-Work Comp etc. | back | 105 |
IME-QME-Work Comp etc. | dr | 82 |
Lab Medicine - Pathology | cm | 35 |
Lab Medicine - Pathology | tumor | 35 |
Lab Medicine - Pathology | x | 32 |
Lab Medicine - Pathology | right | 31 |
Lab Medicine - Pathology | lymph | 30 |
Letters | pain | 80 |
Letters | abc | 71 |
Letters | patient | 65 |
Letters | normal | 53 |
Letters | mr | 52 |
Nephrology | patient | 348 |
Nephrology | renal | 257 |
Nephrology | right | 243 |
Nephrology | history | 160 |
Nephrology | kidney | 144 |
Neurology | right | 694 |
Neurology | left | 672 |
Neurology | patient | 648 |
Neurology | normal | 485 |
Neurology | history | 429 |
Neurosurgery | patient | 374 |
Neurosurgery | placed | 282 |
Neurosurgery | right | 260 |
Neurosurgery | procedure | 247 |
Neurosurgery | left | 222 |
Obstetrics / Gynecology | patient | 628 |
Obstetrics / Gynecology | placed | 350 |
Obstetrics / Gynecology | uterus | 317 |
Obstetrics / Gynecology | procedure | 301 |
Obstetrics / Gynecology | incision | 293 |
Office Notes | normal | 230 |
Office Notes | negative | 193 |
Office Notes | without | 99 |
Office Notes | patient | 94 |
Office Notes | history | 76 |
Ophthalmology | eye | 456 |
Ophthalmology | patient | 258 |
Ophthalmology | right | 192 |
Ophthalmology | procedure | 176 |
Ophthalmology | placed | 168 |
Orthopedic | patient | 1711 |
Orthopedic | right | 1172 |
Orthopedic | left | 998 |
Orthopedic | placed | 924 |
Orthopedic | pain | 763 |
Pain Management | patient | 236 |
Pain Management | procedure | 197 |
Pain Management | needle | 156 |
Pain Management | placed | 99 |
Pain Management | injected | 76 |
Pain Management | pain | 76 |
Pediatrics - Neonatal | patient | 247 |
Pediatrics - Neonatal | history | 235 |
Pediatrics - Neonatal | well | 159 |
Pediatrics - Neonatal | normal | 155 |
Pediatrics - Neonatal | old | 93 |
Physical Medicine - Rehab | patient | 220 |
Physical Medicine - Rehab | right | 107 |
Physical Medicine - Rehab | left | 104 |
Physical Medicine - Rehab | pain | 95 |
Physical Medicine - Rehab | motor | 62 |
Podiatry | foot | 232 |
Podiatry | patient | 231 |
Podiatry | right | 156 |
Podiatry | left | 137 |
Podiatry | tendon | 98 |
Psychiatry / Psychology | patient | 532 |
Psychiatry / Psychology | history | 344 |
Psychiatry / Psychology | mg | 183 |
Psychiatry / Psychology | mother | 164 |
Psychiatry / Psychology | reported | 141 |
Radiology | left | 701 |
Radiology | normal | 644 |
Radiology | right | 644 |
Radiology | patient | 304 |
Radiology | exam | 302 |
Rheumatology | history | 50 |
Rheumatology | patient | 34 |
Rheumatology | mg | 26 |
Rheumatology | pain | 23 |
Rheumatology | day | 22 |
Rheumatology | examination | 22 |
Rheumatology | joints | 22 |
SOAP / Chart / Progress Notes | patient | 537 |
SOAP / Chart / Progress Notes | mg | 302 |
SOAP / Chart / Progress Notes | history | 254 |
SOAP / Chart / Progress Notes | pain | 239 |
SOAP / Chart / Progress Notes | well | 230 |
Sleep Medicine | sleep | 143 |
Sleep Medicine | patient | 69 |
Sleep Medicine | apnea | 35 |
Sleep Medicine | activity | 31 |
Sleep Medicine | stage | 29 |
Speech - Language | patient | 105 |
Speech - Language | therapy | 41 |
Speech - Language | speech | 35 |
Speech - Language | patient’s | 28 |
Speech - Language | able | 26 |
Surgery | patient | 4855 |
Surgery | left | 3263 |
Surgery | right | 3261 |
Surgery | procedure | 3243 |
Surgery | placed | 3025 |
Urology | patient | 776 |
Urology | bladder | 357 |
Urology | right | 307 |
Urology | procedure | 306 |
Urology | placed | 305 |
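If you'd rather not see ties (Dentistry alone contributes many tied rows above), `slice_max()` can break them arbitrarily via `with_ties = FALSE`; a variant of the same pipeline:

mt_samples |>
  unnest_tokens(word, transcription) |>
  filter(
    !word %in% stopwords("english"),
    !grepl("\\d", word)
  ) |>
  group_by(medical_specialty) |>
  count(word) |>
  slice_max(n, n = 5, with_ties = FALSE) |>
  knitr::kable()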
Find your own insight in the data. Ideas (each is worked through below):

- Which words are most distinctive for each specialty (e.g., via tf-idf)?
- How does sentiment vary across specialties?
- How long are the sentences in each specialty?
tf_idf_by_specialty <- mt_samples |>
unnest_tokens(word, transcription) |>
filter(
!word %in% stopwords("english"),
!grepl("[[:digit:]]+", word)
) |>
count(word, medical_specialty) |>
bind_tf_idf(word, medical_specialty, n)
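For reference, `bind_tf_idf()` computes, for word $w$ in specialty $d$ out of $N$ specialties,

$$
\text{tf-idf}(w, d) = \underbrace{\frac{n_{w,d}}{\sum_{w'} n_{w',d}}}_{\text{tf}} \times \underbrace{\ln\frac{N}{\left|\{d' : w \in d'\}\right|}}_{\text{idf}},
$$

so a word that appears in every specialty (like "patient") gets an idf of $\ln 1 = 0$ and drops out, which is exactly why tf-idf surfaces specialty-specific vocabulary.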
tf_idf_by_specialty |>
group_by(medical_specialty) |>
slice_max(tf_idf, n = 3) |>
filter(medical_specialty %in% c("Surgery", "Dentistry", "Allergy / Immunology")) |>
ggplot(aes(reorder(word, tf_idf), tf_idf)) +
theme_minimal() +
labs(y = NULL, x = NULL) +
geom_bar(stat = 'identity') +
coord_flip() +
facet_wrap(~ medical_specialty, scales = "free_y")
sentiment_list <- get_sentiments("bing")
sentiments_in_med <- tf_idf_by_specialty |>
left_join(sentiment_list, by = "word") |>
group_by(medical_specialty) |>
summarise(
n_positive = sum(if_else(sentiment == "positive", n, 0), na.rm = TRUE),
n_negative = sum(if_else(sentiment == "negative", n, 0), na.rm = TRUE),
n = sum(n)
)
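A quick sanity check before trusting the join: the Bing lexicon is a two-column tibble mapping each word to "positive" or "negative":

head(sentiment_list)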
sentiments_in_med |>
mutate(medical_specialty = reorder(
medical_specialty, (n_positive + n_negative) / n)) |>
ggplot(aes(medical_specialty)) +
theme_minimal() +
geom_col(aes(y = -n_negative / n), fill = "pink") +
geom_col(aes(y = n_positive / n), fill = "darkgreen") +
labs(x = NULL, y = NULL) +
coord_flip()
library(ggridges)
mt_samples |>
select(medical_specialty, transcription) |>
unnest_tokens(sentence, transcription, token = "sentences") |>
mutate(n_words = sapply(tokenizers::tokenize_words(sentence), length)) |>
ggplot() +
theme_minimal() +
geom_density_ridges(
aes(y = reorder(medical_specialty, n_words, max),
x = n_words, group = medical_specialty),
bandwidth = .5,
jittered_points = TRUE, point_alpha = .1,
position = position_raincloud(height = .3, ygap = 0)
) +
labs(x = "Number of words in a sentence.", y = NULL)
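A possible shortcut for the word-count step: the tokenizers package also provides `count_words()`, which should collapse the tokenize-then-`length()` pattern into one call (a sketch, same pipeline otherwise):

mt_samples |>
  select(medical_specialty, transcription) |>
  unnest_tokens(sentence, transcription, token = "sentences") |>
  mutate(n_words = tokenizers::count_words(sentence))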
# Runshi's version using `stringr`
# tokenize into sentences
tokens <- mt_samples |> select(transcription, medical_specialty) |>
unnest_tokens(sentence, transcription, token = "sentences")
# create a summary table of sentence length for each specialty
tokens |>
mutate(length = stringr::str_count(sentence, "\\w+")) |>
filter(length > 0 & !is.na(length)) |>
group_by(medical_specialty) |>
summarise(
average_length = mean(length),
max_length = max(length),
min_length = min(length),
n = n()
) |>
ggplot(aes(x = average_length, y = reorder(medical_specialty, average_length))) +
theme_minimal() +
geom_point() +
geom_segment(aes(xend = 0, yend = reorder(medical_specialty, average_length))) +
labs(y = NULL, x = "Mean number of words in a sentence")
# Deliverables