1 Data pre-processing

The data includes all NIH-awarded grants in the United States for FY25 up to the date specified above.

library(wordcloud)
library(tm)
library(dplyr)
library(tidytext)
library(ggplot2)
library(pander)

wd = getwd()
filename = paste0(wd, "/SearchResult_Export_01Apr2025_012331.csv")
data = read.csv(filename)

df = data %>%
  group_by(Administering.IC) %>%
  rename(agency = Administering.IC) %>%
  summarise(total.number = n(),
            total.cost = sum(Total.Cost, na.rm = T)) %>%
  mutate(total.cost.dollars = scales::label_number(accuracy = 0.1, 
                                                   scale_cut = scales::cut_short_scale())(total.cost)) %>%
  select(agency, total.number, total.cost, total.cost.dollars) %>%
  arrange(desc(total.number))

2 Total cost of funded grants

ggplot(df) +
  geom_bar(aes(y = reorder(agency, total.cost), x = total.cost), position = "stack", stat = "identity") +
  xlim(0, max(df$total.cost)*1.1) +
  geom_text(aes(y = reorder(agency, total.cost), x = total.cost, label = total.cost.dollars),
            hjust = -0.2, col = "#0b6fa1") + 
  labs(x = "Agency \n", y = "\n Total Costs") +
  theme(axis.title.x = element_text(face="bold", colour="darkblue", size = 12, vjust = -1.5),
        axis.title.y = element_text(face="bold", colour="darkblue", size = 12, vjust = 6))

3 Total number of funded grants

ggplot(df) +
  geom_bar(aes(y=reorder(agency, total.number), x=total.number), position="stack", stat="identity") +
  xlim(0, max(df$total.number)*1.1) +
  geom_text(aes(y = reorder(agency, total.number), x = total.number, label = total.number),
            hjust = -0.2, col = "#0b6fa1") +
  labs(x = "Agency \n", y = "\n Total Number") +
  theme(axis.title.x = element_text(face="bold", colour="darkblue", size = 12, vjust = -1.5),
        axis.title.y = element_text(face="bold", colour="darkblue", size = 12, vjust = 6))

4 Frequent words in project titles of funded grants

title = data$Project.Title
title = gsub("\\s*\\([^\\)]+\\)", "", title)
title = gsub("[0-9]+", "", title)

title_text = Corpus(VectorSource(title))
title_text_clean = tm_map(title_text, removePunctuation)
title_text_clean = tm_map(title_text_clean, content_transformer(tolower))
title_text_clean = tm_map(title_text_clean, removeNumbers)
title_text_clean = tm_map(title_text_clean, stripWhitespace)
title_text_clean = tm_map(title_text_clean, removeWords, stopwords("english"))

par(bg = "black")
cp = brewer.pal(7, "YlOrRd")
wordcloud(title_text_clean, scale = c(2, 1), min.freq = 150, colors = cp)#rainbow(30))

title_df = data_frame(Text = title)

freq_words = title_df %>%
  unnest_tokens(output = word, input = Text)

freq_words = freq_words %>%
  anti_join(stop_words)

freq_wordcounts = freq_words %>% count(word, sort = TRUE)

freq_wordcounts %>% 
  filter(n >= 50) %>% 
  mutate(word = reorder(word, n)) %>% 
  ggplot(aes(word, n)) + 
  geom_col() +
  coord_flip() +
  labs(x = "Word \n", y = "\n Count ", title = "Frequent Words In Title \n") +
  geom_text(aes(label = n), hjust = 1.2, colour = "white", fontface = "bold") +
  theme(plot.title = element_text(hjust = 0.5), 
        axis.title.x = element_text(face="bold", colour="darkblue", size = 12),
        axis.title.y = element_text(face="bold", colour="darkblue", size = 12))

5 Top 30 frequent words by agency

K = length(df$agency)

for(i in 1:K){
  
  pander::pandoc.header(df$agency[i], level = 2)
  
  findata = data %>%
    filter(Administering.IC == df$agency[i])
  
  title = findata$Project.Title
  title = gsub("\\s*\\([^\\)]+\\)", "", title)
  title = gsub("[0-9]+", "", title)

  title_df = data_frame(Text = title)

  freq_words = title_df %>%
    unnest_tokens(output = word, input = Text)
  
  freq_words = freq_words %>%
    anti_join(stop_words)
  
  freq_wordcounts = freq_words %>% count(word, sort = TRUE)
    
  print(freq_wordcounts %>% 
    head(., 30) %>%
    mutate(word = reorder(word, n)) %>% 
    ggplot(aes(word, n)) + 
    geom_col() +
    coord_flip() +
    labs(x = "Word \n", y = "\n Count ", title = "Frequent Words In Title \n") +
    geom_text(aes(label = n), hjust = 1.2, colour = "white", fontface = "bold") +
    theme(plot.title = element_text(hjust = 0.5), 
          axis.title.x = element_text(face="bold", colour="darkblue", size = 12),
          axis.title.y = element_text(face="bold", colour="darkblue", size = 12)))
  
  cat("\n\n")
}

Text mining and visualizations of frequent words

Oh, Eun Jeong

2025-04-01

1 Data pre-processing

2 Total cost of funded grants

3 Total number of funded grants

4 Frequent words in project titles of funded grants

5 Top 30 frequent words by agency

5.1 NCI

5.2 NIAID

5.3 NHLBI

5.4 NINDS

5.5 NIGMS

5.6 NIA

5.7 NIMH

5.8 NIDDK

5.9 NICHD

5.10 NEI

5.11 NIEHS

5.12 NIDA

5.13 NIDCD

5.14 NIAMS

5.15 NIBIB

5.16 NIMHD

5.17 NIDCR

5.18 OD

5.19 NHGRI

5.20 NIAAA

5.21 FIC

5.22 NCCIH

5.23 NINR

5.24 NCATS

5.25 NLM