I have two tables:
CREATE TABLE t1 (
NAME1 VARCHAR(20),
date DATE
);
INSERT INTO t1 (NAME1, date) VALUES
('RED', '2020-01-05'),
('BLUE', '2020-03-15'),
('GREEN', '2020-06-20'),
('YELLOW', '2020-09-10'),
('PURPLE', '2020-12-25'),
('BLUE', '2020-02-20'),
('RED', '2020-07-15'),
('GREEN', '2020-11-10'),
('BLUE', '2021-02-14'),
('RED', '2021-05-01'),
('ORANGE', '2021-07-04'),
('GREEN', '2021-08-15'),
('PINK', '2021-11-20'),
('BROWN', '2021-03-10'),
('BLUE', '2021-09-22'),
('YELLOW', '2021-12-05'),
('RED', '2022-01-10'),
('PURPLE', '2022-04-22'),
('ORANGE', '2022-07-30'),
('PINK', '2022-10-15');
CREATE TABLE t2 (
NAME2 VARCHAR(20),
date DATE
);
INSERT INTO t2 (NAME2, date) VALUES
('BLUE', '2020-02-10'),
('GREEN', '2020-04-18'),
('YELLOW', '2020-07-22'),
('CYAN', '2020-10-05'),
('MAGENTA', '2020-11-30'),
('BLUE', '2020-05-15'),
('GREEN', '2020-08-20'),
('BLUE', '2021-01-20'),
('SILVER', '2021-03-25'),
('YELLOW', '2021-06-12'),
('GOLD', '2021-09-08'),
('CORAL', '2021-12-15'),
('YELLOW', '2021-02-28'),
('GREEN', '2022-02-28'),
('CYAN', '2022-05-14'),
('BLUE', '2022-08-19'),
('MAGENTA', '2022-09-25'),
('SILVER', '2022-11-11'),
('YELLOW', '2022-12-05'),
('CORAL', '2022-12-20'),
('CYAN', '2022-06-30');
Problem: (Using SQL) In each year, I want to find out: How many distinct names are only in t1, how many distinct names are only in t2, how many distinct names are both in table2. The intended result should look like this:
year only_in_t1 only_in_t2 common_to_both total_distinct_t1 total_distinct_t2 total_distinct_combined
2020 2 2 3 5 5 7
2021 5 3 2 7 5 10
2022 4 7 0 4 7 11
Using the R programming language, I could have used the setdiff() function to accomplish this.
I tried to write an SQL solution, but it got very long:
WITH t1_names AS (
SELECT DISTINCT
YEAR(date) as year_date,
NAME1 as name
FROM t1
),
t2_names AS (
SELECT DISTINCT
YEAR(date) as year_date,
NAME2 as name
FROM t2
),
all_years AS (
SELECT DISTINCT year_date FROM t1_names
UNION
SELECT DISTINCT year_date FROM t2_names
),
t1_by_year AS (
SELECT year_date, COUNT(DISTINCT name) as total_distinct_t1
FROM t1_names
GROUP BY year_date
),
t2_by_year AS (
SELECT year_date, COUNT(DISTINCT name) as total_distinct_t2
FROM t2_names
GROUP BY year_date
),
common_names AS (
SELECT t1_names.year_date, COUNT(DISTINCT t1_names.name) as common_to_both
FROM t1_names
INNER JOIN t2_names ON t1_names.year_date = t2_names.year_date
AND t1_names.name = t2_names.name
GROUP BY t1_names.year_date
),
only_t1 AS (
SELECT t1_names.year_date, COUNT(DISTINCT t1_names.name) as only_in_t1
FROM t1_names
LEFT JOIN t2_names ON t1_names.year_date = t2_names.year_date
AND t1_names.name = t2_names.name
WHERE t2_names.name IS NULL
GROUP BY t1_names.year_date
),
only_t2 AS (
SELECT t2_names.year_date, COUNT(DISTINCT t2_names.name) as only_in_t2
FROM t2_names
LEFT JOIN t1_names ON t2_names.year_date = t1_names.year_date
AND t2_names.name = t1_names.name
WHERE t1_names.name IS NULL
GROUP BY t2_names.year_date
),
combined_names AS (
SELECT year_date, name FROM t1_names
UNION
SELECT year_date, name FROM t2_names
),
combined_by_year AS (
SELECT year_date, COUNT(DISTINCT name) as total_distinct_combined
FROM combined_names
GROUP BY year_date
)
SELECT
ay.year_date as year,
COALESCE(ot1.only_in_t1, 0) as only_in_t1,
COALESCE(ot2.only_in_t2, 0) as only_in_t2,
COALESCE(cn.common_to_both, 0) as common_to_both,
COALESCE(t1y.total_distinct_t1, 0) as total_distinct_t1,
COALESCE(t2y.total_distinct_t2, 0) as total_distinct_t2,
COALESCE(cby.total_distinct_combined, 0) as total_distinct_combined
FROM all_years ay
LEFT JOIN t1_by_year t1y ON ay.year_date = t1y.year_date
LEFT JOIN t2_by_year t2y ON ay.year_date = t2y.year_date
LEFT JOIN common_names cn ON ay.year_date = cn.year_date
LEFT JOIN only_t1 ot1 ON ay.year_date = ot1.year_date
LEFT JOIN only_t2 ot2 ON ay.year_date = ot2.year_date
LEFT JOIN combined_by_year cby ON ay.year_date = cby.year_date
ORDER BY ay.year_date;
Question: Is there an equivalent to the setdiff() function in SQL which would allow me to reduce the length of this code?
Thanks!
R solution:
t1$year <- year(t1$date)
t2$year <- year(t2$date)
analyze_names_by_year <- function(t1, t2) {
all_years <- sort(unique(c(t1$year, t2$year)))
results <- data.frame()
for (yr in all_years) {
names_t1 <- unique(t1$NAME1[t1$year == yr])
names_t2 <- unique(t2$NAME2[t2$year == yr])
only_t1 <- length(setdiff(names_t1, names_t2))
only_t2 <- length(setdiff(names_t2, names_t1))
common <- length(intersect(names_t1, names_t2))
total_t1 <- length(names_t1)
total_t2 <- length(names_t2)
total_combined <- length(union(names_t1, names_t2))
results <- rbind(results, data.frame(
year = yr,
only_in_t1 = only_t1,
only_in_t2 = only_t2,
common_to_both = common,
total_distinct_t1 = total_t1,
total_distinct_t2 = total_t2,
total_distinct_combined = total_combined
))
}
return(results)
}
summary_by_year <- analyze_names_by_year(t1, t2)
Rsolution isn'tR-like and should not function as a template for SQL.