2

I have two tables:

-- First sample table: one row per (name, date) observation.
-- Neither column is constrained, so names may repeat.
CREATE TABLE t1
(
    NAME1  VARCHAR(20),   -- name to compare against t2.NAME2
    date   DATE           -- observation date
);

-- Seed t1 with 20 rows: 8 dated 2020, 8 dated 2021 and 4 dated 2022.
-- Some names occur more than once within the same year.
INSERT INTO t1
    (NAME1, date)
VALUES
    -- 2020
    ('RED',    '2020-01-05'),
    ('BLUE',   '2020-03-15'),
    ('GREEN',  '2020-06-20'),
    ('YELLOW', '2020-09-10'),
    ('PURPLE', '2020-12-25'),
    ('BLUE',   '2020-02-20'),
    ('RED',    '2020-07-15'),
    ('GREEN',  '2020-11-10'),
    -- 2021
    ('BLUE',   '2021-02-14'),
    ('RED',    '2021-05-01'),
    ('ORANGE', '2021-07-04'),
    ('GREEN',  '2021-08-15'),
    ('PINK',   '2021-11-20'),
    ('BROWN',  '2021-03-10'),
    ('BLUE',   '2021-09-22'),
    ('YELLOW', '2021-12-05'),
    -- 2022
    ('RED',    '2022-01-10'),
    ('PURPLE', '2022-04-22'),
    ('ORANGE', '2022-07-30'),
    ('PINK',   '2022-10-15');


-- Second sample table: same shape as t1, with the name column called NAME2.
-- Neither column is constrained, so names may repeat.
CREATE TABLE t2
(
    NAME2  VARCHAR(20),   -- name to compare against t1.NAME1
    date   DATE           -- observation date
);

-- Seed t2 with 21 rows: 7 dated 2020, 6 dated 2021 and 8 dated 2022.
-- Some names occur more than once within the same year.
INSERT INTO t2
    (NAME2, date)
VALUES
    -- 2020
    ('BLUE',    '2020-02-10'),
    ('GREEN',   '2020-04-18'),
    ('YELLOW',  '2020-07-22'),
    ('CYAN',    '2020-10-05'),
    ('MAGENTA', '2020-11-30'),
    ('BLUE',    '2020-05-15'),
    ('GREEN',   '2020-08-20'),
    -- 2021
    ('BLUE',    '2021-01-20'),
    ('SILVER',  '2021-03-25'),
    ('YELLOW',  '2021-06-12'),
    ('GOLD',    '2021-09-08'),
    ('CORAL',   '2021-12-15'),
    ('YELLOW',  '2021-02-28'),
    -- 2022
    ('GREEN',   '2022-02-28'),
    ('CYAN',    '2022-05-14'),
    ('BLUE',    '2022-08-19'),
    ('MAGENTA', '2022-09-25'),
    ('SILVER',  '2022-11-11'),
    ('YELLOW',  '2022-12-05'),
    ('CORAL',   '2022-12-20'),
    ('CYAN',    '2022-06-30');

Problem: Using SQL, for each year I want to find out how many distinct names are only in t1, how many distinct names are only in t2, and how many distinct names are in both tables (plus the total number of distinct names in t1, in t2, and in the two tables combined). The intended result should look like this:

  year only_in_t1 only_in_t2 common_to_both total_distinct_t1 total_distinct_t2 total_distinct_combined
 2020          2          2              3                 5                 5                       7
 2021          5          3              2                 7                 5                      10
 2022          4          7              0                 4                 7                      11

Using the R programming language, I could have used the setdiff() function to accomplish this.

I tried to write an SQL solution, but it got very long:

-- Per-year set comparison of the names in t1 and t2.
-- Each CTE produces one output column keyed by year; the final SELECT joins
-- them onto the list of all years and turns missing values into 0.
WITH names_t1 AS (
    -- distinct (year, name) pairs present in t1
    SELECT DISTINCT
        YEAR(date) AS year_date,
        NAME1      AS name
    FROM t1
),
names_t2 AS (
    -- distinct (year, name) pairs present in t2
    SELECT DISTINCT
        YEAR(date) AS year_date,
        NAME2      AS name
    FROM t2
),
years AS (
    -- every year seen in either table (UNION removes the duplicates)
    SELECT year_date FROM names_t1
    UNION
    SELECT year_date FROM names_t2
),
totals_t1 AS (
    -- names_t1 is already distinct per (year, name), so a plain COUNT suffices
    SELECT year_date, COUNT(name) AS total_distinct_t1
    FROM names_t1
    GROUP BY year_date
),
totals_t2 AS (
    SELECT year_date, COUNT(name) AS total_distinct_t2
    FROM names_t2
    GROUP BY year_date
),
shared AS (
    -- names that appear in both tables in the same year
    SELECT a.year_date, COUNT(a.name) AS common_to_both
    FROM names_t1 a
    INNER JOIN names_t2 b
        ON  a.year_date = b.year_date
        AND a.name      = b.name
    GROUP BY a.year_date
),
t1_exclusive AS (
    -- anti-join: t1 names with no partner in t2 for that year
    SELECT a.year_date, COUNT(a.name) AS only_in_t1
    FROM names_t1 a
    LEFT JOIN names_t2 b
        ON  a.year_date = b.year_date
        AND a.name      = b.name
    WHERE b.name IS NULL
    GROUP BY a.year_date
),
t2_exclusive AS (
    -- anti-join: t2 names with no partner in t1 for that year
    SELECT b.year_date, COUNT(b.name) AS only_in_t2
    FROM names_t2 b
    LEFT JOIN names_t1 a
        ON  b.year_date = a.year_date
        AND b.name      = a.name
    WHERE a.name IS NULL
    GROUP BY b.year_date
),
totals_combined AS (
    -- UNION de-duplicates the (year, name) pairs across both tables
    SELECT u.year_date, COUNT(u.name) AS total_distinct_combined
    FROM (
        SELECT year_date, name FROM names_t1
        UNION
        SELECT year_date, name FROM names_t2
    ) u
    GROUP BY u.year_date
)
SELECT
    y.year_date                              AS year,
    COALESCE(x1.only_in_t1, 0)               AS only_in_t1,
    COALESCE(x2.only_in_t2, 0)               AS only_in_t2,
    COALESCE(s.common_to_both, 0)            AS common_to_both,
    COALESCE(n1.total_distinct_t1, 0)        AS total_distinct_t1,
    COALESCE(n2.total_distinct_t2, 0)        AS total_distinct_t2,
    COALESCE(nc.total_distinct_combined, 0)  AS total_distinct_combined
FROM years y
LEFT JOIN totals_t1       n1 ON y.year_date = n1.year_date
LEFT JOIN totals_t2       n2 ON y.year_date = n2.year_date
LEFT JOIN shared          s  ON y.year_date = s.year_date
LEFT JOIN t1_exclusive    x1 ON y.year_date = x1.year_date
LEFT JOIN t2_exclusive    x2 ON y.year_date = x2.year_date
LEFT JOIN totals_combined nc ON y.year_date = nc.year_date
ORDER BY y.year_date;

Question: Is there an equivalent to the setdiff() function in SQL which would allow me to reduce the length of this code?

Thanks!


R solution:

# Add a `year` column to each table, derived from its `date` column.
# NOTE(review): year() is not a base R function -- presumably lubridate::year
# or data.table::year; confirm which package is attached.
t1[["year"]] <- year(t1[["date"]])
t2[["year"]] <- year(t2[["date"]])


# Compare the distinct names of two tables year by year.
#
# t1: data.frame with columns NAME1 and year
# t2: data.frame with columns NAME2 and year
#
# Returns a data.frame with one row per year found in either input (sorted
# ascending) and the columns year, only_in_t1, only_in_t2, common_to_both,
# total_distinct_t1, total_distinct_t2 and total_distinct_combined.
analyze_names_by_year <- function(t1, t2) {

  # One-row summary for a single year.
  summarise_year <- function(yr) {
    in_t1 <- unique(t1$NAME1[t1$year == yr])
    in_t2 <- unique(t2$NAME2[t2$year == yr])

    data.frame(
      year = yr,
      only_in_t1 = length(setdiff(in_t1, in_t2)),
      only_in_t2 = length(setdiff(in_t2, in_t1)),
      common_to_both = length(intersect(in_t1, in_t2)),
      total_distinct_t1 = length(in_t1),
      total_distinct_t2 = length(in_t2),
      total_distinct_combined = length(union(in_t1, in_t2))
    )
  }

  # Start from an empty frame and append one row per year.
  results <- data.frame()
  for (yr in sort(unique(c(t1$year, t2$year)))) {
    results <- rbind(results, summarise_year(yr))
  }

  results
}


# Build the per-year summary for the two tables.
summary_by_year <- analyze_names_by_year(t1 = t1, t2 = t2)
1
  • 1
    The R solution isn't R-like and should not function as a template for SQL. Commented Oct 24 at 6:43

2 Answers 2

7

You are rather over-complicating it. There is no need to aggregate in so many different ways. This is basically a full join of the two tables plus aggregation, with the twist that we first need to de-duplicate each table by name and year.

The various columns can be obtained with a filtered COUNT(). Note that COUNT() only counts non-null values. In some DBMSs a filtered count can be done with COUNT(*) FILTER (WHERE some_condition), but I have shown standard syntax.

-- Per-year comparison of the distinct names in t1 and t2.
-- Each table is first reduced to distinct (year, name) pairs; the FULL JOIN on
-- both columns then yields exactly one row per pair, with NAME1 and/or NAME2
-- populated. COUNT() skips NULLs, so each conditional count below acts as a
-- set operation.
SELECT
  COALESCE(t1.year, t2.year) AS year,
  COUNT(CASE WHEN t2.NAME2 IS NULL THEN 1 END) AS only_in_t1,
  COUNT(CASE WHEN t1.NAME1 IS NULL THEN 1 END) AS only_in_t2,
  COUNT(CASE WHEN t1.NAME1 IS NOT NULL AND t2.NAME2 IS NOT NULL THEN 1 END) AS common_to_both,
  COUNT(t1.NAME1) AS total_distinct_t1,
  COUNT(t2.NAME2) AS total_distinct_t2,
  COUNT(*) AS total_distinct_combined
FROM (
    SELECT DISTINCT
      -- EXTRACT is standard SQL and returns the year number (2020); DATE_TRUNC
      -- would return a timestamp at the start of the year instead.
      EXTRACT(YEAR FROM date) AS year,
      NAME1
    FROM t1
) t1
FULL JOIN  (
    SELECT DISTINCT
      EXTRACT(YEAR FROM date) AS year,
      NAME2
    FROM t2
) t2 ON t2.NAME2 = t1.NAME1 AND t2.year = t1.year
GROUP BY
  COALESCE(t1.year, t2.year)
-- Without an ORDER BY the row order is not guaranteed.
ORDER BY
  COALESCE(t1.year, t2.year);

db<>fiddle

Sign up to request clarification or add additional context in comments.

Comments

2

We can select the distinct names from both tables with "UNION", tagging each row with its source table (n = 1 for t1, n = 2 for t2). For year = 2020:

n namex yy
1 BLUE 2020
2 BLUE 2020
2 CYAN 2020
1 GREEN 2020
2 GREEN 2020
2 MAGENTA 2020
1 PURPLE 2020
1 RED 2020
1 YELLOW 2020
2 YELLOW 2020

Then group by "year" and "name"

yy namex nn nnFl
2020 BLUE 3 both
2020 CYAN 2 t2
2020 GREEN 3 both
2020 MAGENTA 2 t2
2020 PURPLE 1 t1
2020 RED 1 t1
2020 YELLOW 3 both

A name with nn = 3 is in both tables, nn = 1 means it is only in t1, and nn = 2 means it is only in t2.
Count the totals using these flags.

See example

-- Tag every row with its source table (1 = t1, 2 = t2) and let UNION collapse
-- duplicate (tag, name, year) rows. Summing the tags per (year, name) then
-- gives 1 = only in t1, 2 = only in t2, 3 = in both.
WITH tagged AS (
    SELECT 1 AS n, name1 AS namex, YEAR(date) AS yy FROM t1
    UNION
    SELECT 2 AS n, name2 AS namex, YEAR(date) AS yy FROM t2
),
per_name AS (
    SELECT yy, namex, SUM(n) AS nn
    FROM tagged
    GROUP BY yy, namex
)
SELECT
    yy,
    SUM(CASE WHEN nn = 1 THEN 1 ELSE 0 END)       AS only_in_t1,
    SUM(CASE WHEN nn = 2 THEN 1 ELSE 0 END)       AS only_in_t2,
    SUM(CASE WHEN nn = 3 THEN 1 ELSE 0 END)       AS common_to_both,
    SUM(CASE WHEN nn IN (1, 3) THEN 1 ELSE 0 END) AS total_distinct_t1,
    SUM(CASE WHEN nn IN (2, 3) THEN 1 ELSE 0 END) AS total_distinct_t2,
    COUNT(*)                                      AS total_distinct_combined
FROM per_name
GROUP BY yy

Fiddle

As for an analog of "setdiff()": the SQL set operator EXCEPT plays that role. See the examples below.
common <- length(intersect(names_t1, names_t2))

-- SQL analog of intersect(names_t1, names_t2), counted per year.
-- INTERSECT keeps only the (name, year) pairs that occur in both tables;
-- UNION here would return the combined total instead.
select yy,count(*) common_to_both
from(
    select name1 namex,year(date) yy from t1
    intersect
    select name2 namex,year(date) yy from t2
)a
group by yy;
-- Result for the sample data. A year without any common name (here 2022)
-- produces no row at all:
-- YY COMMON_TO_BOTH
-- 2020 3
-- 2021 2

only_t1 <- length(setdiff(names_t1, names_t2))

-- SQL analog of setdiff(names_t1, names_t2), counted per year:
-- EXCEPT keeps the distinct (name, year) pairs of t1 that do not occur in t2.
SELECT
    yy,
    COUNT(*) AS only_in_t1
FROM (
    SELECT name1 AS namex, YEAR(date) AS yy FROM t1
    EXCEPT
    SELECT name2 AS namex, YEAR(date) AS yy FROM t2
) t1_minus_t2
GROUP BY yy;
YY ONLY_IN_T1
2020 2
2021 5
2022 4

only_t2 <-length(setdiff(names_t2, names_t1))

-- SQL analog of setdiff(names_t2, names_t1), counted per year:
-- EXCEPT keeps the distinct (name, year) pairs of t2 that do not occur in t1.
SELECT
    yy,
    COUNT(*) AS only_in_t2
FROM (
    SELECT name2 AS namex, YEAR(date) AS yy FROM t2
    EXCEPT
    SELECT name1 AS namex, YEAR(date) AS yy FROM t1
) t2_minus_t1
GROUP BY yy;

YY ONLY_IN_T2
2020 2
2021 3
2022 7
-- Number of distinct names in t1 per year (length(names_t1) in the R version).
SELECT
    YEAR(date)            AS yy,
    COUNT(DISTINCT name1) AS total_distinct_t1
FROM t1
GROUP BY YEAR(date)

YY TOTAL_DISTINCT_T1
2020 5
2021 7
2022 4

total_combined <- length(union(names_t1, names_t2))

-- SQL analog of union(names_t1, names_t2), counted per year:
-- UNION merges both tables and removes duplicate (name, year) pairs.
SELECT
    yy,
    COUNT(*) AS total_distinct_combined
FROM (
    SELECT name2 AS namex, YEAR(date) AS yy FROM t2
    UNION
    SELECT name1 AS namex, YEAR(date) AS yy FROM t1
) all_names
GROUP BY yy;
YY TOTAL_DISTINCT_COMBINED
2020 7
2021 10
2022 11

fiddle

Comments

Your Answer

By clicking “Post Your Answer”, you agree to our terms of service and acknowledge you have read our privacy policy.

Start asking to get answers

Find the answer to your question by asking.

Ask question

Explore related questions

See similar questions with these tags.