# Default knitr chunk options for this document (no options overridden here).
knitr::opts_chunk$set()
unnest_tokens()
and unnest_ngrams()
to
extract tokens and ngrams from text. For this lab we will be working with the medical record transcriptions from https://www.mtsamples.com/. The data are loaded and “fairly” cleaned at https://github.com/JSC370/jsc370-2023/blob/main/data/medical_transcriptions/.
You should load in dplyr
, (or data.table
if
you want to work that way), ggplot2
and
tidytext
. If you don’t already have tidytext
then you can install with
install.packages("tidytext")
Loading in reference transcription samples from https://www.mtsamples.com/
# Text-mining and data-manipulation packages used throughout the lab.
library(tidytext)
library(readr)
library(dplyr)
library(tidyr)
library(ggplot2)

# Location of the (fairly) cleaned MT Samples transcription data.
data_url <- paste0(
  "https://raw.githubusercontent.com/JSC370/",
  "jsc370-2023/main/data/medical_transcriptions/mtsamples.csv"
)

# Read the CSV and keep only the three columns this lab uses.
mt_samples <- read_csv(data_url)
mt_samples <- select(mt_samples, description, medical_specialty, transcription)

head(mt_samples)
## # A tibble: 6 × 3
## description medic…¹ trans…²
## <chr> <chr> <chr>
## 1 A 23-year-old white female presents with complaint of allergi… Allerg… "SUBJE…
## 2 Consult for laparoscopic gastric bypass. Bariat… "PAST …
## 3 Consult for laparoscopic gastric bypass. Bariat… "HISTO…
## 4 2-D M-Mode. Doppler. Cardio… "2-D M…
## 5 2-D Echocardiogram Cardio… "1. T…
## 6 Morbid obesity. Laparoscopic antecolic antegastric Roux-en-Y… Bariat… "PREOP…
## # … with abbreviated variable names ¹medical_specialty, ²transcription
We can use count()
from dplyr
to figure out
how many different categories we have. Are these categories related?
overlapping? evenly distributed?
# Count transcriptions per specialty, order specialties by frequency, and
# draw a horizontal bar chart (labels suppressed; the axes are self-evident).
specialty_counts <- mt_samples |>
  count(medical_specialty, sort = TRUE) |>
  mutate(medical_specialty = forcats::fct_reorder(medical_specialty, n))

ggplot(specialty_counts, aes(medical_specialty, n)) +
  geom_col() +
  coord_flip() +
  labs(x = NULL, y = NULL) +
  theme_minimal()
transcription
column. Explain what we see from this result. Does it make sense? What insights (if any) do we get?
# Tokenize each transcription into single words and tally word frequencies.
# `tokens` is reused below, so its contents must stay one row per distinct word.
tokens <- mt_samples |>
  select(transcription) |>
  unnest_tokens(word, transcription) |>
  count(word)

# Twenty most common tokens (stop words are still included at this point).
tokens |>
  slice_max(n, n = 20) |>
  ggplot(aes(reorder(word, n), n)) +
  geom_col() +
  coord_flip() +
  labs(x = NULL, y = NULL) +
  theme_minimal()
What do we see now that we have removed stop words? Does it give us a better idea of what the text is about?
# Preview the English stop-word list supplied by the {stopwords} package.
library(stopwords)
head(stopwords("english"))
## [1] "i" "me" "my" "myself" "we" "our"
# The default English list contains 175 words.
length(stopwords("english"))
## [1] 175
# Drop stop words and numeric tokens so the frequency plot highlights
# content-bearing vocabulary.  The original left the "remove numbers" step
# as a TODO even though the word-cloud question below requires it.
tokens_no_stopwords <- tokens |>
  filter(
    !word %in% stopwords("english"),  # remove stop words
    !grepl("[[:digit:]]", word)       # remove tokens containing digits
  )

# Top 20 remaining words, as horizontal bars ordered by frequency.
tokens_no_stopwords |>
  slice_max(n, n = 20) |>
  ggplot(aes(reorder(word, n), n)) +
  theme_minimal() +
  labs(y = NULL, x = NULL) +
  geom_col() +
  coord_flip()
Another method for visualizing word counts is using a word cloud via
wordcloud::wordcloud()
. Create a word cloud for the top 50
most frequent words after removing stop words (and numbers).
# Word cloud of the 50 most frequent tokens after stop-word removal.
library(wordcloud)
## Loading required package: RColorBrewer

tokens50 <- tokens_no_stopwords |>
  slice_max(n, n = 50)

# Word size is driven by frequency; colors cycle through the Set2 palette.
wordcloud(
  tokens50$word,
  tokens50$n,
  colors = brewer.pal(8, "Set2")
)
Repeat question 3, but this time tokenize into bi-grams. How does the result change if you look at tri-grams? (You don’t need to create the word clouds.)
# Build one big alternation regex per position: sw_start matches a bigram
# whose FIRST word is a stop word (patterns like "^i ", "^me ", ...), and
# sw_end matches a bigram whose LAST word is a stop word (" i$", " me$", ...).
# The collapse strings splice the anchors/spaces between every stop word.
# start with any of the stop words
sw_start <- paste0("^", paste(stopwords("english"), collapse = " |^"), " ")
# end with any of the stop words
sw_end <- paste0(" ", paste(stopwords("english"), collapse = "$| "), "$")

# Tokenize into bigrams, drop any bigram that starts or ends with a stop
# word, then count each remaining bigram.
tokens_bigram <- mt_samples |>
  select(transcription) |>
  unnest_tokens(ngram, transcription, token = "ngrams", n = 2) |>
  filter(
    # remove those with stop words
    !grepl(sw_start, ngram, ignore.case = TRUE),
    !grepl(sw_end, ngram, ignore.case = TRUE)
    # remove numbers
  ) |>
  count(ngram)

# Twenty most frequent bigrams as horizontal bars.
tokens_bigram |>
  slice_max(n, n = 20) |>
  ggplot(aes(reorder(ngram, n), n)) +
  theme_minimal() +
  labs(y = NULL, x = NULL) +
  geom_bar(stat = "identity") +
  coord_flip()
Using the results you got from question 4, pick a word and count the words that appear after or before it. (Also, try finding pairs of words after and before the word.)
library(stringr)

# Neighbours of "patient": keep bigrams where "patient" is the first or last
# word, strip "patient" plus the separating space, and total the counts of
# the remaining word.
patient_neighbours <- tokens_bigram |>
  filter(grepl("\\spatient$|^patient\\s", ngram)) |>
  mutate(
    word = sub("patient", "", ngram),
    word = gsub(" ", "", word, fixed = TRUE)
  ) |>
  group_by(word) |>
  summarise(n = sum(n))

# Fifty most common neighbouring words, as horizontal bars.
patient_neighbours |>
  slice_max(n, n = 50) |>
  ggplot(aes(reorder(word, n), n)) +
  geom_col() +
  coord_flip() +
  labs(x = NULL, y = NULL) +
  theme_minimal()
Which words are most used in each of the specialties? You can use
group_by()
and slice_max()
from
dplyr
to have the calculations be done within each
specialty. Remember to remove stop words. How about the 5 most used
words?
Find your own insight in the data:
Ideas:
# TF-IDF of words within each medical specialty, stop words removed.
# `tf_idf_by_specialty` is reused for the sentiment analysis below, so its
# definition is kept exactly as before.
tf_idf_by_specialty <- mt_samples |>
  unnest_tokens(word, transcription) |>
  filter(
    !word %in% stopwords("english")
    # try removing numbers
  ) |>
  count(word, medical_specialty) |>
  bind_tf_idf(word, medical_specialty, n)

# Top-5 TF-IDF words for three contrasting specialties.
top_tf_idf <- tf_idf_by_specialty |>
  group_by(medical_specialty) |>
  slice_max(tf_idf, n = 5) |>
  filter(
    medical_specialty %in% c("Surgery", "Dentistry", "Allergy / Immunology")
  )

ggplot(top_tf_idf, aes(reorder(word, tf_idf), tf_idf)) +
  geom_col() +
  coord_flip() +
  labs(x = NULL, y = NULL) +
  facet_wrap(~ medical_specialty, scales = "free_y") +
  theme_minimal()
# Transcriptions containing the literal token " elevator " (spaces included,
# so partial matches like "elevators" are excluded).
mt_samples |>
  filter(grepl(" elevator ", transcription, fixed = TRUE))
## # A tibble: 65 × 3
## description medic…¹ trans…²
## <chr> <chr> <chr>
## 1 Surgical removal of completely bony impacted teeth #1, #16, … Dentis… PREOPE…
## 2 Total hip arthroplasty on the left. Left hip degenerative a… Surgery PREOPE…
## 3 Tonsillectomy, uvulopalatopharyngoplasty, and septoplasty fo… Surgery PREOPE…
## 4 Left mesothelioma, focal. Left anterior pleural-based nodul… Surgery PREOPE…
## 5 Extraction of tooth #T and incision and drainage (I&D) of ri… Surgery PREOPE…
## 6 Extraction of teeth #2 and #19 and incision and drainage (I&… Surgery PREOPE…
## 7 Right argon laser assisted stapedectomy. Bilateral conducti… Surgery PREOPE…
## 8 Open reduction and internal plate and screw fixation of depr… Surgery PREOPE…
## 9 Laparoscopic right salpingooophorectomy. Right pelvic pain … Surgery PREOPE…
## 10 Revision rhinoplasty and left conchal cartilage harvest to c… Surgery PREOPE…
## # … with 55 more rows, and abbreviated variable names ¹medical_specialty,
## # ²transcription
# Bing sentiment lexicon: one row per word, sentiment = "positive"/"negative".
sentiment_list <- get_sentiments("bing")
# Left join keeps every word; words absent from the lexicon get NA sentiment.
# NOTE(review): if a word carried both sentiments in the lexicon this join
# would duplicate its row — worth confirming against the Bing word list.
sentiments_in_med <- tf_idf_by_specialty |>
left_join(sentiment_list, by = "word")
# Per specialty, total the counts of positive, negative, and all words.
# ifelse() on an NA sentiment yields NA; na.rm = TRUE drops those words from
# the positive/negative sums while `n = sum(n)` still counts every word.
sentiments_in_med_by_sp <- sentiments_in_med |>
group_by(medical_specialty) |>
summarise(
n_positive = sum(ifelse(sentiment == "positive", n, 0), na.rm = TRUE),
n_negative = sum(ifelse(sentiment == "negative", n, 0), na.rm = TRUE),
n = sum(n)
)
# Diverging bars: negative share of words to the left (pink), positive share
# to the right (dark green); specialties ordered by overall sentiment share.
sentiments_in_med_by_sp |>
ggplot(aes(reorder(medical_specialty, (n_negative + n_positive) / n))) +
theme_minimal() +
geom_col(aes(y = - n_negative / n), fill = "pink") +
geom_col(aes(y = n_positive / n), fill = "darkgreen") +
labs(x = NULL, y = NULL) +
coord_flip()