1 Google Scholar Data

The data were pulled from my Google Scholar profile at the end of March 2026.

For more information, please refer to the scholar R package. It allows users to extract citation data from Google Scholar, compare multiple scholars, and even predict future h-index values. See https://github.com/YuLab-SMU/scholar for details.

library(scholar)
library(tidyverse)
library(knitr)
library(kableExtra)
library(tm)
library(wordcloud)
library(RColorBrewer)
library(dplyr)
library(purrr)
library(tidyr)
library(ggplot2)
library(stringr)
library(forcats)

# Google Scholar profile to analyse -- substitute "your_google_scholar_id".
id = "zpclPhcAAAAJ"

# Publication records for that profile, as returned by get_publications().
pubs = get_publications(id)

# Presentation copy of the publication list: six columns under reader-friendly
# headers, newest papers first, citation count in the last column.
# NOTE(review): kableExtra::linebreak() is documented as a LaTeX helper, while
# the table is later rendered with format = "html" -- confirm it is still
# wanted here.
dat = pubs %>%
  dplyr::select(
    Author = author,
    Title = title,
    Year = year,
    Journal = journal,
    Number = number,
    `# of Cites` = cites
  ) %>%
  mutate(Journal = linebreak(Journal)) %>%
  arrange(desc(Year))

1.1 List of publications

# Render the publication list as a compact, striped, hoverable HTML table.
# The value is left visible at top level so it is printed into the report.
kable_styling(
  kable(dat, format = "html"),
  bootstrap_options = c("striped", "hover", "condensed", "responsive"),
  font_size = 12
)

2 Exploratory Analyses

2.1 Text mining application

# Word cloud of the terms used in publication titles.

# Raw titles, minus parenthesised asides and digit runs.
title = dat$Title
title = gsub("\\s*\\([^\\)]+\\)", "", title)
title = gsub("[0-9]+", "", title)

title_text = Corpus(VectorSource(title))

# Cleaning order matters:
#  * lower-case first so the (lower-case) stop-word list can match;
#  * remove stop words BEFORE punctuation, otherwise entries that contain an
#    apostrophe (e.g. "it's", "don't") are mangled and never match;
#  * collapse whitespace LAST so the gaps left by removed words are cleaned up.
title_text_clean = tm_map(title_text, content_transformer(tolower))
title_text_clean = tm_map(title_text_clean, removeWords, stopwords("english"))
title_text_clean = tm_map(title_text_clean, removePunctuation)
title_text_clean = tm_map(title_text_clean, removeNumbers)
title_text_clean = tm_map(title_text_clean, stripWhitespace)

# Draw on a black background, then restore the previous graphics settings so
# later base-graphics plots are not affected.
old_par = par(bg = "black")
cp = brewer.pal(7, "YlOrRd")
wordcloud(title_text_clean, scale = c(2, 1), min.freq = 15, colors = cp)
par(old_par)

2.2 Citations over time

# Calendar year at run time; used later as the last year of every series.
current_year = as.integer(format(Sys.Date(), "%Y"))

# Fetch the yearly citation history of one article of profile `id`.
# Each call first pauses for a random 2-4 seconds to space out the requests,
# and is wrapped in purrr::safely() so a single failure does not abort the
# whole run.
safe_cite_fetch = purrr::safely(function(article_id) {
  Sys.sleep(runif(1, 2, 4))
  get_article_cite_history(id, article_id)
})

# One list(result =, error =) per publication.
cite_results = map(pubs$pubid, safe_cite_fetch)

# Anything that is not a data frame (an error, or a non-tabular return value)
# counts as a failed fetch. Report these instead of dropping them silently, so
# an incomplete plot can be traced back to missing data.
fetched_ok = map_lgl(cite_results, ~ is.data.frame(.x$result))
if (!all(fetched_ok)) {
  warning(
    sum(!fetched_ok), " of ", length(fetched_ok),
    " citation histories could not be fetched and are omitted.",
    call. = FALSE
  )
}

# Stack the successful histories into one long table.
all_cites = bind_rows(map(cite_results[fetched_ok], "result"))

# Per-paper metadata to attach to every yearly citation row.
pubs_summary = pubs %>%
  dplyr::select(pubid, title, paper_year = year, total_cites = cites)

all_cites = all_cites %>%
  left_join(pubs_summary, by = "pubid")

# Give every paper one row per year up to `current_year`, with zero citations
# in the years that had no record, then accumulate citations over time.
#
# The first year of each series is the earliest of the paper's year, its first
# recorded citation year and `current_year`. Taking the minimum over all three
# (ignoring NA) keeps seq() valid when a paper has no publication year, or a
# publication year later than the current year; both cases would otherwise
# make seq() fail.
all_cites_filled = all_cites %>%
  group_by(pubid) %>%
  tidyr::complete(
    year = seq(
      min(c(paper_year, year, current_year), na.rm = TRUE),
      current_year,
      1
    )
  ) %>%
  # Rows created by complete() carry NA metadata; copy it from real rows.
  fill(title, paper_year, total_cites, .direction = "downup") %>%
  mutate(cites = replace_na(cites, 0)) %>%
  # Sort chronologically so the running total below is taken in year order.
  arrange(year) %>%
  mutate(cum_cites = cumsum(cites)) %>%
  ungroup()

# The ten papers with the highest cumulative citation count.
top10_info = all_cites_filled %>%
  group_by(pubid) %>%
  summarise(total_cum_cites = max(cum_cites), .groups = "drop") %>%
  arrange(-total_cum_cites) %>%
  head(10)

# Yearly rows of just those papers, each carrying the paper's final total
# (an inner join both restricts the rows and attaches the total).
cites_plot_data = inner_join(all_cites_filled, top10_info, by = "pubid")

# Cumulative citations over time for the ten most cited papers.
# One colour per paper, from a 10-colour qualitative palette.
my_colors = brewer.pal(10, "Set3")

ggplot(cites_plot_data, aes(
  x = factor(year),
  y = cum_cites,
  # Group by paper id so each paper gets its own line on the discrete x axis.
  group = pubid,
  # Legend labels: titles cut to 30 characters, ordered by total citations.
  color = fct_reorder(str_trunc(title, 30), total_cum_cites))
  ) +
  # `linewidth` sets line thickness; using `size` for lines is deprecated
  # since ggplot2 3.4.0.
  geom_line(linewidth = 1.2, alpha = 0.85) +
  scale_color_manual(values = my_colors) +
  labs(
    title = "Top 10 Most Cited Papers Over Time",
    x = "Year",
    y = "Cumulative Citations",
    color = "Paper"
  ) +
  theme_bw() +
  theme(panel.border = element_rect(color = "black", fill = NA, linewidth = 1))

2.3 h-index over time

# h-index of the profile by year.
citations = get_citation_history(id)

# get_citation_history() reports citations per year (`year`, `cites`) and does
# not provide an h-index, so derive one from the per-paper cumulative citation
# counts in `all_cites_filled`: for each year, the h-index is the number of
# papers whose cumulative citations are at least their rank when sorted in
# decreasing order.
# This is an approximation: it only covers papers whose citation history was
# fetched successfully.
if (!"h_index" %in% names(citations)) {
  h_index_by_year = all_cites_filled %>%
    group_by(year) %>%
    summarise(
      h_index = sum(sort(cum_cites, decreasing = TRUE) >= seq_along(cum_cites)),
      .groups = "drop"
    )
  citations = left_join(citations, h_index_by_year, by = "year")
}

ggplot(citations, aes(
  x = factor(year),
  y = h_index,
  # A discrete x axis puts every year in its own group; force a single group
  # so geom_line() connects the points.
  group = 1)
  ) +
  # `linewidth` sets line thickness; using `size` for lines is deprecated
  # since ggplot2 3.4.0.
  geom_line(color = "blue", linewidth = 1.2) +
  geom_point(color = "red") +
  labs(
    title = "h-index Over Time",
    x = "Year",
    y = "h-index"
  ) +
  theme_minimal()

2.4 Future h-index prediction

# Forecast future h-index values for the profile `id`. The call sits at top
# level, so its return value is printed.
predict_h_index(id)

Analysis of Google Scholar data using R