1 Data pre-processing

The data includes all NIH-awarded grants in the United States for FY25 up to the date specified above.

library(wordcloud)
library(tm)
library(dplyr)
library(tidytext)
library(ggplot2)
library(pander)

wd = getwd()
filename = paste0(wd, "/SearchResult_Export_01Apr2025_012331.csv")
data = read.csv(filename)

df = data %>%
  group_by(Administering.IC) %>%
  rename(agency = Administering.IC) %>%
  summarise(total.number = n(),
            total.cost = sum(Total.Cost, na.rm = T)) %>%
  mutate(total.cost.dollars = scales::label_number(accuracy = 0.1, 
                                                   scale_cut = scales::cut_short_scale())(total.cost)) %>%
  select(agency, total.number, total.cost, total.cost.dollars) %>%
  arrange(desc(total.number))

2 Total cost of funded grants

ggplot(df) +
  geom_bar(aes(y = reorder(agency, total.cost), x = total.cost), position = "stack", stat = "identity") +
  xlim(0, max(df$total.cost)*1.1) +
  geom_text(aes(y = reorder(agency, total.cost), x = total.cost, label = total.cost.dollars),
            hjust = -0.2, col = "#0b6fa1") + 
  labs(x = "Agency \n", y = "\n Total Costs") +
  theme(axis.title.x = element_text(face="bold", colour="darkblue", size = 12, vjust = -1.5),
        axis.title.y = element_text(face="bold", colour="darkblue", size = 12, vjust = 6))

3 Total number of funded grants

ggplot(df) +
  geom_bar(aes(y=reorder(agency, total.number), x=total.number), position="stack", stat="identity") +
  xlim(0, max(df$total.number)*1.1) +
  geom_text(aes(y = reorder(agency, total.number), x = total.number, label = total.number),
            hjust = -0.2, col = "#0b6fa1") +
  labs(x = "Agency \n", y = "\n Total Number") +
  theme(axis.title.x = element_text(face="bold", colour="darkblue", size = 12, vjust = -1.5),
        axis.title.y = element_text(face="bold", colour="darkblue", size = 12, vjust = 6))

4 Frequent words in project titles of funded grants

title = data$Project.Title
title = gsub("\\s*\\([^\\)]+\\)", "", title)
title = gsub("[0-9]+", "", title)

title_text = Corpus(VectorSource(title))
title_text_clean = tm_map(title_text, removePunctuation)
title_text_clean = tm_map(title_text_clean, content_transformer(tolower))
title_text_clean = tm_map(title_text_clean, removeNumbers)
title_text_clean = tm_map(title_text_clean, stripWhitespace)
title_text_clean = tm_map(title_text_clean, removeWords, stopwords("english"))

par(bg = "black")
cp = brewer.pal(7, "YlOrRd")
wordcloud(title_text_clean, scale = c(2, 1), min.freq = 150, colors = cp)#rainbow(30))

title_df = data_frame(Text = title)

freq_words = title_df %>%
  unnest_tokens(output = word, input = Text)

freq_words = freq_words %>%
  anti_join(stop_words)

freq_wordcounts = freq_words %>% count(word, sort = TRUE)

freq_wordcounts %>% 
  filter(n >= 50) %>% 
  mutate(word = reorder(word, n)) %>% 
  ggplot(aes(word, n)) + 
  geom_col() +
  coord_flip() +
  labs(x = "Word \n", y = "\n Count ", title = "Frequent Words In Title \n") +
  geom_text(aes(label = n), hjust = 1.2, colour = "white", fontface = "bold") +
  theme(plot.title = element_text(hjust = 0.5), 
        axis.title.x = element_text(face="bold", colour="darkblue", size = 12),
        axis.title.y = element_text(face="bold", colour="darkblue", size = 12))

5 Top 30 frequent words by agency

K = length(df$agency)

for(i in 1:K){
  
  pander::pandoc.header(df$agency[i], level = 2)
  
  findata = data %>%
    filter(Administering.IC == df$agency[i])
  
  title = findata$Project.Title
  title = gsub("\\s*\\([^\\)]+\\)", "", title)
  title = gsub("[0-9]+", "", title)

  title_df = data_frame(Text = title)

  freq_words = title_df %>%
    unnest_tokens(output = word, input = Text)
  
  freq_words = freq_words %>%
    anti_join(stop_words)
  
  freq_wordcounts = freq_words %>% count(word, sort = TRUE)
    
  print(freq_wordcounts %>% 
    head(., 30) %>%
    mutate(word = reorder(word, n)) %>% 
    ggplot(aes(word, n)) + 
    geom_col() +
    coord_flip() +
    labs(x = "Word \n", y = "\n Count ", title = "Frequent Words In Title \n") +
    geom_text(aes(label = n), hjust = 1.2, colour = "white", fontface = "bold") +
    theme(plot.title = element_text(hjust = 0.5), 
          axis.title.x = element_text(face="bold", colour="darkblue", size = 12),
          axis.title.y = element_text(face="bold", colour="darkblue", size = 12)))
  
  cat("\n\n")
}

5.1 NCI

5.2 NIAID

5.3 NHLBI

5.4 NINDS

5.5 NIGMS

5.6 NIA

5.7 NIMH

5.8 NIDDK

5.9 NICHD

5.10 NEI

5.11 NIEHS

5.12 NIDA

5.13 NIDCD

5.14 NIAMS

5.15 NIBIB

5.16 NIMHD

5.17 NIDCR

5.18 OD

5.19 NHGRI

5.20 NIAAA

5.21 FIC

5.22 NCCIH

5.23 NINR

5.24 NCATS

5.25 NLM