11---
22output :
3- rmarkdown::github_document :
4- html_preview : false
3+ rmarkdown::github_document :
4+ # html_preview: false
55---
66
# Software Development Standards ![GitHub last commit](https://img.shields.io/github/last-commit/USCbiostats/software-dev)
@@ -11,116 +11,254 @@ This project's main contents are located in the project's [Wiki](wiki#welcome-to
1111# USCbiostats R packages
1212
1313``` {r setup, include=FALSE}
14+ library(httr)
15+ library(stringr)
16+ library(knitr)
17+ library(scholar) # <--- The key difference
18+ ```
19+
20+
21+ ``` {r, include=FALSE}
1422knitr::opts_chunk$set(warning = FALSE, message = FALSE)
1523```
1624
17- ``` {r listing-pkgs, echo = FALSE}
25+ ``` {r, include=FALSE}
26+ # We'll assume `packages.csv` has columns:
27+ # name, repo, on_bioc, scholar_id, pubid, google_scholar, description
28+ # Lines starting with '#' in CSV are ignored.
29+
1830pkgs <- read.csv("packages.csv", comment.char = "#", stringsAsFactors = FALSE)
1931
20- # Alphabetically ordered
21- pkgs <- pkgs[order(pkgs$name),,drop=FALSE]
# Normalise the Bioconductor flag. The column may be absent, and read.csv()
# may deliver it as logical or as text. `%in%` never yields NA, so blank or
# NA cells become FALSE. rep() keeps the assignment valid for a zero-row table.
if ("on_bioc" %in% names(pkgs)) {
  pkgs$on_bioc <- pkgs$on_bioc %in% c("TRUE", "True", "true")
} else {
  pkgs$on_bioc <- rep(FALSE, nrow(pkgs))
}

# Drop rows without a package name *before* any network call (otherwise the
# CRAN check below would request ".../package=NA"), then sort by name.
has_name <- !is.na(pkgs$name) & pkgs$name != ""
pkgs <- pkgs[has_name, , drop = FALSE]
pkgs <- pkgs[order(pkgs$name), , drop = FALSE]

# Check CRAN status: a package counts as "on CRAN" only when its canonical
# CRAN page can be fetched and answers with HTTP 200. Any request error is
# treated the same as a non-200 answer.
pkgs$on_cran <- vapply(
  pkgs$name,
  function(nm) {
    resp <- tryCatch(
      httr::GET(sprintf("https://cran.r-project.org/package=%s", nm)),
      error = function(e) e
    )
    !inherits(resp, "error") && httr::status_code(resp) == 200
  },
  logical(1),
  USE.NAMES = FALSE
)
54+
55+ # Build the data frame that will become our final table
56+ dat <- data.frame(
57+ Name = character(nrow(pkgs)),
58+ Description = character(nrow(pkgs)),
59+ Citations = character(nrow(pkgs)), # will fill in
60+ stringsAsFactors = FALSE
61+ )
62+
63+ for (i in seq_len(nrow(pkgs))) {
64+ nm <- pkgs$name[i]
65+ repo_url <- if (!is.na(pkgs$repo[i]) && nzchar(pkgs$repo[i])) {
66+ pkgs$repo[i]
67+ } else {
68+ paste0("https://github.com/USCbiostats/", nm)
69+ }
70+ # The clickable package name
71+ dat$Name[i] <- sprintf("[**%s**](%s)", nm, repo_url)
72+
73+ desc_txt <- pkgs$description[i] # base description
74+
75+ # If on CRAN, add badges
76+ if (pkgs$on_cran[i]) {
77+ desc_txt <- paste(
78+ desc_txt,
79+ sprintf("[](https://CRAN.R-project.org/package=%1$s)", nm),
80+ sprintf("[](https://CRAN.R-project.org/package=%1$s)", nm)
81+ )
82+ }
83+
84+ # If on Bioc, add a badge
85+ if (pkgs$on_bioc[i]) {
86+ desc_txt <- paste(
87+ desc_txt,
88+ # Build status shield
5489 sprintf(
55- "[](https://cran.r-project. org/package= %1$s)",
56- name
90+ "[](https://bioconductor. org/packages/release/bioc/html/ %1$s.html )",
91+ nm
5792 ),
93+ # Downloads rank shield
5894 sprintf(
59- "[](https://CRAN.R-project. org/package= %1$s)",
60- name
95+ "[](https://bioconductor. org/packages/release/bioc/html/ %1$s.html )",
96+ nm)
6197 )
62- ), stringsAsFactors = FALSE))
63-
64- test <- with(pkgs, !on_cran & is.na(on_bioc))
65- dat$Description[!pkgs$on_cran] <- pkgs$description[!pkgs$on_cran]
66-
67- for (pkg in pkgs$name[which(pkgs$on_bioc)]) {
68- dat[which(pkgs$name == pkg), "Description"] <-
69- paste0(pkgs$description[pkgs$name == pkg],
70- sprintf("[](https://www.bioconductor.org/packages/%s)", pkg),
71- badger::badge_bioc_download(pkg, "total", "blue", "total"))
98+ }
99+
100+ dat$Description[i] <- desc_txt
72101}
73102
103+ # Initialize Citations
104+ dat$Citations <- ""
74105```
75106
76- ``` {r citations, include=FALSE}
77- regex <- "([0-9,]+)[\\s\\n]+results?[\\s\\n]+\\([\\s\\n]*[0-9]+" #[\\s\\n]+[(][0-9]*[.]?[0-9]+\\s+secs?
78- dat$Citations <- ""
79- tot_citations <- 0L
80- for (i in seq_len(nrow(pkgs))) {
107+ ``` {r, include=FALSE}
108+ # -----------------------------
109+ # 1) Scholar approach:
110+ # -----------------------------
# Look up a citation count through the `scholar` package.
#
# Arguments:
#   sid      Google Scholar profile id passed to the scholar functions.
#   pubid    Publication id within that profile; NA or "" when unknown.
#   pkg_name Package name, used to pick a publication by title when `pubid`
#            is not available.
#
# Returns an integer count, or NA_integer_ when nothing usable was found
# (lookup failed, empty result, or no title contains the package name).
get_scholar_citation_count <- function(sid, pubid, pkg_name) {
  # A specific publication id is known: sum its per-year citation history.
  if (!is.na(pubid) && nzchar(pubid)) {
    article_hist <- tryCatch(
      get_article_cite_history(sid, pubid),
      error = function(e) NULL
    )
    if (is.data.frame(article_hist) && nrow(article_hist) > 0 &&
        "cites" %in% names(article_hist)) {
      return(as.integer(sum(article_hist$cites)))
    }
    return(NA_integer_)
  }

  # No publication id: fall back to matching the package name against titles.
  # An empty name would match every title, so give up early.
  if (is.na(pkg_name) || !nzchar(pkg_name)) {
    return(NA_integer_)
  }

  pubs <- tryCatch(
    get_publications(sid),
    error = function(e) NULL
  )
  if (!is.data.frame(pubs) || nrow(pubs) == 0) {
    return(NA_integer_)
  }

  # Literal, case-insensitive substring match. The name must not be treated
  # as a regular expression: "." in names such as "data.table" would match
  # any character and could select an unrelated publication.
  titles <- tolower(as.character(pubs$title))
  idx <- which(grepl(tolower(pkg_name), titles, fixed = TRUE))
  if (length(idx) == 0) {
    return(NA_integer_)
  }

  # First matching publication's total citations.
  as.integer(pubs$cites[idx[1]])
}
140+
141+ # -----------------------------
142+ # 2) Old HTML scraping approach:
143+ # -----------------------------
144+ # We'll define a function that tries to parse a Google Scholar URL (like ?cites=...)
145+ # using readLines or GET+iconv, then run a regex to find "XXX results" lines.
146+ # If found, return XXX as integer. Otherwise NA.
# Extract the hit count from the text of a Google Scholar results page.
#
# Arguments:
#   page_txt Character scalar holding the page content.
#
# Returns the number in an "About X results" / "X results" phrase as an
# integer, or NA_integer_ when no such phrase is present.
parse_scholar_result_count <- function(page_txt) {
  if (length(page_txt) != 1L || is.na(page_txt)) {
    return(NA_integer_)
  }

  # Remove attribute-less tags (e.g. <b>, </b>) that may sit inside the phrase.
  txt <- gsub("\\<[[:alnum:]_/-]+\\>", "", page_txt, perl = TRUE)

  # The pattern is kept on one line on purpose: a line break inside the string
  # literal would become part of the pattern. The capture group is the count
  # itself, so unrelated digits elsewhere on the page are never picked up.
  re <- "(?:About\\s+)?([0-9][0-9,]*)\\s+results?\\b"
  hit <- regmatches(txt, regexec(re, txt, perl = TRUE, ignore.case = TRUE))[[1]]
  if (length(hit) < 2L) {
    return(NA_integer_)
  }

  # Convert e.g. "1,234" -> 1234
  as.integer(gsub("[^0-9]", "", hit[2]))
}

# Scrape a citation count from a Google Scholar URL (such as a "?cites=..."
# results page).
#
# Arguments:
#   gs_url URL to fetch; NA or "" means there is nothing to look up.
#
# Returns the count as an integer, or NA_integer_ when the URL is missing,
# the request fails or does not answer 200, or no "X results" phrase is found.
get_html_scrape_citation_count <- function(gs_url) {
  if (length(gs_url) != 1L || is.na(gs_url) || !nzchar(gs_url)) {
    return(NA_integer_)
  }

  # Fetch the body as raw bytes and convert it ourselves so that invalid
  # byte sequences are substituted instead of aborting the conversion.
  page_txt <- tryCatch({
    resp <- httr::GET(gs_url)
    if (httr::status_code(resp) != 200) {
      stop("HTTP status not 200")
    }
    raw_ct <- httr::content(resp, as = "raw")
    # rawToChar() must return ONE string here; `multiple = TRUE` would give
    # one string per byte and the page could never be searched as text.
    iconv(rawToChar(raw_ct), from = "UTF-8", to = "UTF-8", sub = "byte")
  }, error = function(e) {
    NULL
  })

  if (is.null(page_txt)) {
    return(NA_integer_)
  }

  parse_scholar_result_count(page_txt)
}
108209
109- if (tot_citations == 0L)
110- stop("There can't be 0 citations! Make sure things are running as expected!")
# Running total of citations across all packages (reported below the table).
tot_citations <- 0L

# Fill dat$Citations row by row. For each package the Scholar lookup is tried
# first (when a scholar_id is given); if it yields nothing usable, the
# google_scholar URL is scraped instead. Rows with no count keep "".
for (i in seq_len(nrow(pkgs))) {

  pkg_name <- pkgs$name[i]
  # The optional columns may be missing from packages.csv altogether.
  sid <- if ("scholar_id" %in% names(pkgs)) pkgs$scholar_id[i] else NA_character_
  pubid <- if ("pubid" %in% names(pkgs)) pkgs$pubid[i] else NA_character_
  old_link <- if ("google_scholar" %in% names(pkgs)) pkgs$google_scholar[i] else NA_character_

  has_link <- !is.na(old_link) && nzchar(old_link)
  has_pubid <- !is.na(pubid) && nzchar(pubid)

  # 1) Scholar approach, only when a profile id is available
  if (!is.na(sid) && nzchar(sid)) {
    cval <- get_scholar_citation_count(sid, pubid, pkg_name)
    if (!is.na(cval) && cval >= 0) {
      cval <- as.integer(cval)
      # Link the count only for an identified publication AND only when a URL
      # actually exists; otherwise the cell would render as "[n](NA)".
      dat$Citations[i] <- if (has_pubid && has_link) {
        sprintf("[%d](%s)", cval, old_link)
      } else {
        as.character(cval)
      }
      tot_citations <- tot_citations + cval
      next # Done with this package
    }
  }

  # 2) Fallback: HTML scraping of the google_scholar URL. The scraper returns
  #    NA for a missing URL, so a valid count implies the link exists.
  cval_html <- get_html_scrape_citation_count(old_link)
  if (!is.na(cval_html) && cval_html >= 0) {
    cval_html <- as.integer(cval_html)
    dat$Citations[i] <- sprintf("[%d](%s)", cval_html, old_link)
    tot_citations <- tot_citations + cval_html
  }
}
111247```
112248
113- As of ` r Sys.Date() ` , the packages listed here have been cited ** ` r tot_citations ` ** times
114- (source: Google Scholar).
115249
116250``` {r printing, echo = FALSE}
117251knitr::kable(dat, row.names = FALSE)
118252```
119253
As of `r Sys.Date()`, the packages listed here have been cited **`r tot_citations`** times
(source: Google Scholar).
256+
To update this list, modify the file [packages.csv](packages.csv). The
`README.md` file is updated automatically using GitHub Actions, so there's no
need to "manually" recompile the README file after updating the list.
123260
261+
124262# Coding Standards
125263
1262641 . [ Coding Standards] ( wiki#coding-standards )
0 commit comments