Skip to content

Commit 8f6e206

Browse files
authored
Merge pull request #18 from YichenGuo82/master
readme update-Yichen
2 parents 02ee3f5 + 149e6ea commit 8f6e206

File tree

3 files changed

+259
-122
lines changed

3 files changed

+259
-122
lines changed

README.Rmd

Lines changed: 209 additions & 71 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
---
22
output:
3-
rmarkdown::github_document:
4-
html_preview: false
3+
rmarkdown::github_document:
4+
# html_preview: false
55
---
66

77
# Software Development Standards ![GitHub last commit](https://img.shields.io/github/last-commit/USCbiostats/software-dev)
@@ -11,116 +11,254 @@ This project's main contents are located in the project's [Wiki](wiki#welcome-to
1111
# USCbiostats R packages
1212

1313
```{r setup, include=FALSE}
14+
library(httr)
15+
library(stringr)
16+
library(knitr)
17+
library(scholar) # <--- The key difference
18+
```
19+
20+
21+
```{r, include=FALSE}
1422
knitr::opts_chunk$set(warning = FALSE, message = FALSE)
1523
```
1624

17-
```{r listing-pkgs, echo = FALSE}
25+
```{r, include=FALSE}
26+
# We'll assume `packages.csv` has columns:
27+
# name, repo, on_bioc, scholar_id, pubid, google_scholar, description
28+
# Lines starting with '#' in CSV are ignored.
29+
1830
pkgs <- read.csv("packages.csv", comment.char = "#", stringsAsFactors = FALSE)

# Drop rows without a package name *before* any network access (otherwise each
# blank row costs a pointless CRAN request), then sort alphabetically.
pkgs <- pkgs[!(is.na(pkgs$name) | pkgs$name == ""), , drop = FALSE]
pkgs <- pkgs[order(pkgs$name), , drop = FALSE]

# on_bioc: create the column when absent, otherwise normalise the text values
# "TRUE"/"True"/"true" (or a logical TRUE) to a logical vector.
if (!"on_bioc" %in% names(pkgs)) {
  pkgs$on_bioc <- rep(FALSE, nrow(pkgs))
} else {
  pkgs$on_bioc <- pkgs$on_bioc %in% c("TRUE", "True", "true")
}

# on_cran: a package counts as "on CRAN" only when its canonical CRAN URL
# answers with HTTP 200; any request error is treated as "not on CRAN".
pkgs$on_cran <- vapply(
  pkgs$name,
  function(nm) {
    resp <- tryCatch(
      GET(sprintf("https://cran.r-project.org/package=%s", nm)),
      error = function(e) e
    )
    !inherits(resp, "error") && status_code(resp) == 200
  },
  logical(1),
  USE.NAMES = FALSE
)

# Link target for the package name: the explicit repo when given, otherwise
# the USCbiostats GitHub organisation.
repo_url <- ifelse(
  !is.na(pkgs$repo) & nzchar(pkgs$repo),
  pkgs$repo,
  paste0("https://github.com/USCbiostats/", pkgs$name)
)

# Badge markup, computed for every package and appended only where relevant.
cran_badges <- paste(
  sprintf("[![CRAN status](https://www.r-pkg.org/badges/version/%1$s)](https://CRAN.R-project.org/package=%1$s)", pkgs$name),
  sprintf("[![CRAN downloads](https://cranlogs.r-pkg.org/badges/grand-total/%1$s)](https://CRAN.R-project.org/package=%1$s)", pkgs$name)
)
bioc_badges <- paste(
  # Build status shield
  sprintf(
    "[![BioC build status](https://bioconductor.org/shields/build/release/bioc/%s.svg)](https://bioconductor.org/packages/release/bioc/html/%1$s.html)",
    pkgs$name
  ),
  # Downloads rank shield
  sprintf(
    "[![BioC downloads](https://bioconductor.org/shields/downloads/release/%s.svg)](https://bioconductor.org/packages/release/bioc/html/%1$s.html)",
    pkgs$name
  )
)

# Base description, followed by the CRAN badges and then the Bioconductor ones.
desc_txt <- pkgs$description
desc_txt <- ifelse(pkgs$on_cran, paste(desc_txt, cran_badges), desc_txt)
desc_txt <- ifelse(pkgs$on_bioc, paste(desc_txt, bioc_badges), desc_txt)

# Final table; the Citations column starts empty and is filled in by the
# citations chunk further down.
dat <- data.frame(
  Name = sprintf("[**%s**](%s)", pkgs$name, repo_url),
  Description = as.character(desc_txt),
  Citations = rep("", nrow(pkgs)),
  stringsAsFactors = FALSE
)
74105
```
75106

76-
```{r citations, include=FALSE}
77-
regex <- "([0-9,]+)[\\s\\n]+results?[\\s\\n]+\\([\\s\\n]*[0-9]+" #[\\s\\n]+[(][0-9]*[.]?[0-9]+\\s+secs?
78-
dat$Citations <- ""
79-
tot_citations <- 0L
80-
for (i in seq_len(nrow(pkgs))) {
107+
```{r, include=FALSE}
108+
# -----------------------------
109+
# 1) Scholar approach:
110+
# -----------------------------
111+
# Look up a citation count through the `scholar` package helpers.
#
# Arguments:
#   sid       Google Scholar author id, forwarded to the scholar helpers.
#   pubid     Publication id. When non-missing and non-empty, the per-year
#             counts from get_article_cite_history() are summed.
#   pkg_name  Package name. Used only when `pubid` is missing/empty: the first
#             publication whose title contains the name wins.
#
# Returns a single integer, or NA_integer_ when nothing could be retrieved.
get_scholar_citation_count <- function(sid, pubid, pkg_name) {
  # Case 1: a specific publication id is known.
  if (!is.na(pubid) && nzchar(pubid)) {
    article_hist <- tryCatch(
      get_article_cite_history(sid, pubid),
      error = function(e) NULL
    )
    if (is.data.frame(article_hist) && nrow(article_hist) > 0 &&
        "cites" %in% names(article_hist)) {
      # as.integer() so callers can safely format the value with "%d".
      return(as.integer(sum(article_hist$cites)))
    }
    return(NA_integer_)
  }

  # Case 2: no publication id; search the author's publications by title.
  pubs <- tryCatch(
    get_publications(sid),
    error = function(e) NULL
  )
  if (!is.data.frame(pubs) || nrow(pubs) == 0 ||
      !all(c("title", "cites") %in% names(pubs))) {
    return(NA_integer_)
  }
  # An empty/missing name would match every title, so refuse to guess.
  if (is.na(pkg_name) || !nzchar(pkg_name)) {
    return(NA_integer_)
  }

  # Literal, case-insensitive substring match. The name is NOT treated as a
  # regular expression, so a "." in a package name only matches a real ".".
  hits <- which(grepl(tolower(pkg_name), tolower(pubs$title), fixed = TRUE))
  if (length(hits) == 0) {
    return(NA_integer_)
  }

  # Total cites of the first matching publication.
  as.integer(pubs$cites[hits[1]])
}
140+
141+
# -----------------------------
142+
# 2) Old HTML scraping approach:
143+
# -----------------------------
144+
# We'll define a function that tries to parse a Google Scholar URL (like ?cites=...)
145+
# using readLines or GET+iconv, then run a regex to find "XXX results" lines.
146+
# If found, return XXX as integer. Otherwise NA.
147+
# Scrape a citation count from a Google Scholar results page.
#
# Arguments:
#   gs_url  URL of a Scholar results page (e.g. a "?cites=..." link).
#
# Returns the number found in the first "<N> results" phrase of the page as an
# integer, or NA_integer_ when the URL is missing/empty, the request fails or
# does not return HTTP 200, or no such phrase is found.
get_html_scrape_citation_count <- function(gs_url) {

  if (length(gs_url) != 1L || is.na(gs_url) || !nzchar(gs_url)) {
    return(NA_integer_)
  }

  # Fetch the page as ONE character string. Any failure (network error,
  # non-200 status, decoding problem) collapses to NULL.
  page_txt <- tryCatch({
    resp <- httr::GET(gs_url)
    if (httr::status_code(resp) != 200) {
      stop("HTTP status not 200")
    }
    httr::content(resp, as = "text", encoding = "UTF-8")
  }, error = function(e) {
    NULL
  })

  if (is.null(page_txt) || length(page_txt) != 1L || is.na(page_txt)) {
    return(NA_integer_)
  }

  # Work line by line.
  lines <- strsplit(page_txt, "\n", fixed = TRUE)[[1]]

  # Drop simple attribute-less tags such as <b> or </b>, which can sit between
  # the number and the word "results".
  lines <- gsub("\\<[[:alnum:]_/-]+\\>", "", lines, perl = TRUE)

  # A number (digits with optional thousands commas) directly followed by
  # "result"/"results"; this covers both "About 1,234 results" and
  # "12 results (0.03 sec)". The look-ahead keeps only the number in the match.
  re <- "[0-9][0-9,]*(?=\\s+results?\\b)"
  m <- regexpr(re, lines, perl = TRUE, ignore.case = TRUE)

  # regmatches() returns the matched text of the matching lines only, in order.
  hits <- regmatches(lines, m)
  if (length(hits) == 0) {
    return(NA_integer_)
  }

  # Convert e.g. "1,234" -> 1234, using the first match on the page.
  as.integer(gsub("[^0-9]", "", hits[1]))
}
108209
109-
if (tot_citations == 0L)
110-
stop("There can't be 0 citations! Make sure things are running as expected!")
210+
tot_citations <- 0L

# Fill dat$Citations row by row and accumulate the grand total.
# Source priority per package:
#   1) the scholar package (needs a non-empty scholar_id),
#   2) scraping the URL in the google_scholar column.
# The scholar_id / pubid / google_scholar columns are optional in the CSV.
for (i in seq_len(nrow(pkgs))) {

  pkg_name <- pkgs$name[i]
  sid      <- if ("scholar_id" %in% names(pkgs)) pkgs$scholar_id[i] else NA_character_
  pubid    <- if ("pubid" %in% names(pkgs)) pkgs$pubid[i] else NA_character_
  old_link <- if ("google_scholar" %in% names(pkgs)) pkgs$google_scholar[i] else NA_character_

  has_link <- !is.na(old_link) && nzchar(old_link)

  cval      <- NA_integer_
  want_link <- FALSE

  # 1) Scholar approach. The count is rendered as a link only when a specific
  #    publication id was used for the lookup.
  if (!is.na(sid) && nzchar(sid)) {
    cval      <- get_scholar_citation_count(sid, pubid, pkg_name)
    want_link <- !is.na(pubid) && nzchar(pubid)
  }

  # 2) Fallback: scrape the google_scholar URL. isTRUE() also covers NA and
  #    zero-length results from step 1.
  if (!isTRUE(cval >= 0)) {
    cval      <- get_html_scrape_citation_count(old_link)
    want_link <- TRUE
  }

  # Nothing usable from either source: leave the cell empty.
  if (!isTRUE(cval >= 0)) {
    next
  }

  cval <- as.integer(cval) # "%d" needs an integer-valued argument

  # Only emit a Markdown link when there is a URL to point at; otherwise show
  # the bare count instead of a broken "[N](NA)" link.
  dat$Citations[i] <- if (want_link && has_link) {
    sprintf("[%d](%s)", cval, old_link)
  } else {
    as.character(cval)
  }
  tot_citations <- tot_citations + cval
}
111247
```
112248

113-
As of `r Sys.Date()`, the packages listed here have been cited **`r tot_citations`** times
114-
(source: Google Scholar).
115249

116250
```{r printing, echo = FALSE}
117251
knitr::kable(dat, row.names = FALSE)
118252
```
119253

254+
As of `r Sys.Date()`, the packages listed here have been cited **`r tot_citations`** times
255+
(source: Google Scholar).
256+
120257
To update this list, modify the file [packages.csv](packages.csv). The
121258
`README.md` file is updated automatically using GitHub Actions, so there's no
122259
need to "manually" recompile the README file after updating the list.
123260

261+
124262
# Coding Standards
125263

126264
1. [Coding Standards](wiki#coding-standards)

0 commit comments

Comments
 (0)