Revisions to Database schema for discussion forum scraping

added 2 characters in body

Source Link

edited Sep 26, 2021 at 15:30

71.2k
5
76
257

-- Base table holding only the columns common to every forum's posts.
CREATE TABLE forum_posts (
    id          INT           IDENTITY PRIMARY KEY,
    created_at  DATETIME2     NOT NULL,
    msg         NVARCHAR(MAX) NOT NULL,
    -- Defaults to the insert-time timestamp when no value is supplied.
    scrape_time DATETIME2     NOT NULL DEFAULT CURRENT_TIMESTAMP
);

-- Fish-forum-only columns. id is both the primary key and a foreign key to
-- the forum_posts row it extends.
CREATE TABLE fish_forum_posts (
    id INT PRIMARY KEY
        REFERENCES forum_posts (id)
        ON UPDATE CASCADE ON DELETE CASCADE,

    -- Delete this if it's in the same sequence as the other fora
    post_id INT NOT NULL UNIQUE,

    reply_count INT NOT NULL
        CHECK (reply_count >= 0),

    -- Points at another row of this same table through its post_id.
    -- NO ACTION (rather than CASCADE) is needed to avoid cascade cycles.
    reply_to_post_number INT
        REFERENCES fish_forum_posts (post_id)
        ON UPDATE NO ACTION ON DELETE NO ACTION
);

-- Golf-forum-only columns. id is both the primary key and a foreign key to
-- the forum_posts row it extends.
CREATE TABLE golf_forum_posts (
    id INT PRIMARY KEY
        REFERENCES forum_posts (id)
        ON UPDATE CASCADE ON DELETE CASCADE,

    -- Delete this if it's in the same sequence as the other fora
    post_id INT NOT NULL UNIQUE,

    profile_id INT NOT NULL,

    like_count INT NOT NULL
        CHECK (like_count >= 0)
);

-- Lookup table of models; each name may appear only once.
CREATE TABLE models (
    id   INT           IDENTITY PRIMARY KEY,
    name NVARCHAR(256) NOT NULL UNIQUE
);

-- Model output attached to a post. The key is the (id, model) pair, so the
-- same post can carry one row per model.
create table preprocessed_posts(
    -- Post the prediction belongs to; follows updates/deletes of that post.
    id int not null references forum_posts(id)
        on update cascade on delete cascade,
        
    prediction nvarchar(256) not null,
    -- Exact decimal, constrained to the closed range 0..1.
    confidence decimal(12, 4) not null check (confidence between 0 and 1),
    -- Model that produced the prediction; follows updates/deletes of that model.
    model int not null references models(id)
        on update cascade on delete cascade,
        
    primary key(id, model)
);

-- Base table holding only the columns common to every forum's posts.
CREATE TABLE forum_posts (
    id          INT           IDENTITY PRIMARY KEY,
    created_at  DATETIME2     NOT NULL,
    msg         NVARCHAR(MAX) NOT NULL,
    -- Defaults to the insert-time timestamp when no value is supplied.
    scrape_time DATETIME2     NOT NULL DEFAULT CURRENT_TIMESTAMP
);

-- Fish-forum-only columns. id is both the primary key and a foreign key to
-- the forum_posts row it extends.
CREATE TABLE fish_forum_posts (
    id INT PRIMARY KEY
        REFERENCES forum_posts (id)
        ON UPDATE CASCADE ON DELETE CASCADE,

    -- Delete this if it's in the same sequence as the other fora
    post_id INT NOT NULL UNIQUE,

    reply_count INT NOT NULL
        CHECK (reply_count >= 0),

    -- Points at another row of this same table through its post_id.
    -- NO ACTION (rather than CASCADE) is needed to avoid cascade cycles.
    reply_to_post_number INT
        REFERENCES fish_forum_posts (post_id)
        ON UPDATE NO ACTION ON DELETE NO ACTION
);

-- Golf-forum-only columns. id is both the primary key and a foreign key to
-- the forum_posts row it extends.
CREATE TABLE golf_forum_posts (
    id INT PRIMARY KEY
        REFERENCES forum_posts (id)
        ON UPDATE CASCADE ON DELETE CASCADE,

    -- Delete this if it's in the same sequence as the other fora
    post_id INT NOT NULL UNIQUE,

    profile_id INT NOT NULL,

    like_count INT NOT NULL
        CHECK (like_count >= 0)
);

-- Lookup table of models; each name may appear only once.
CREATE TABLE models (
    id   INT           IDENTITY PRIMARY KEY,
    name NVARCHAR(256) NOT NULL UNIQUE
);

-- Model output attached to a post. In this revision id is the primary key on
-- its own, so UNIQUE (id, model) rejects nothing the key does not already
-- reject.
CREATE TABLE preprocessed_posts (
    id INT PRIMARY KEY
        REFERENCES forum_posts (id)
        ON UPDATE CASCADE ON DELETE CASCADE,

    prediction NVARCHAR(256) NOT NULL,

    -- Exact decimal, constrained to the closed range 0..1.
    confidence DECIMAL(12, 4) NOT NULL
        CHECK (confidence BETWEEN 0 AND 1),

    model INT NOT NULL
        REFERENCES models (id)
        ON UPDATE CASCADE ON DELETE CASCADE,

    UNIQUE (id, model)
);

-- Base table holding only the columns common to every forum's posts.
CREATE TABLE forum_posts (
    id          INT           IDENTITY PRIMARY KEY,
    created_at  DATETIME2     NOT NULL,
    msg         NVARCHAR(MAX) NOT NULL,
    -- Defaults to the insert-time timestamp when no value is supplied.
    scrape_time DATETIME2     NOT NULL DEFAULT CURRENT_TIMESTAMP
);

-- Fish-forum-only columns. id is both the primary key and a foreign key to
-- the forum_posts row it extends.
CREATE TABLE fish_forum_posts (
    id INT PRIMARY KEY
        REFERENCES forum_posts (id)
        ON UPDATE CASCADE ON DELETE CASCADE,

    -- Delete this if it's in the same sequence as the other fora
    post_id INT NOT NULL UNIQUE,

    reply_count INT NOT NULL
        CHECK (reply_count >= 0),

    -- Points at another row of this same table through its post_id.
    -- NO ACTION (rather than CASCADE) is needed to avoid cascade cycles.
    reply_to_post_number INT
        REFERENCES fish_forum_posts (post_id)
        ON UPDATE NO ACTION ON DELETE NO ACTION
);

-- Golf-forum-only columns. id is both the primary key and a foreign key to
-- the forum_posts row it extends.
CREATE TABLE golf_forum_posts (
    id INT PRIMARY KEY
        REFERENCES forum_posts (id)
        ON UPDATE CASCADE ON DELETE CASCADE,

    -- Delete this if it's in the same sequence as the other fora
    post_id INT NOT NULL UNIQUE,

    profile_id INT NOT NULL,

    like_count INT NOT NULL
        CHECK (like_count >= 0)
);

-- Lookup table of models; each name may appear only once.
CREATE TABLE models (
    id   INT           IDENTITY PRIMARY KEY,
    name NVARCHAR(256) NOT NULL UNIQUE
);

-- Model output attached to a post. The key is the (id, model) pair, so the
-- same post can carry one row per model.
CREATE TABLE preprocessed_posts (
    id INT NOT NULL
        REFERENCES forum_posts (id)
        ON UPDATE CASCADE ON DELETE CASCADE,

    prediction NVARCHAR(256) NOT NULL,

    -- Exact decimal, constrained to the closed range 0..1.
    confidence DECIMAL(12, 4) NOT NULL
        CHECK (confidence BETWEEN 0 AND 1),

    model INT NOT NULL
        REFERENCES models (id)
        ON UPDATE CASCADE ON DELETE CASCADE,

    PRIMARY KEY (id, model)
);

added 34 characters in body

Source Link

edited Sep 25, 2021 at 12:04

Reinderien

71.2k
5
76
257

-- Base table holding only the columns common to every forum's posts.
CREATE TABLE forum_posts (
    id          INT           IDENTITY PRIMARY KEY,
    created_at  DATETIME2     NOT NULL,
    msg         NVARCHAR(MAX) NOT NULL,
    -- Defaults to the insert-time timestamp when no value is supplied.
    scrape_time DATETIME2     NOT NULL DEFAULT CURRENT_TIMESTAMP
);

-- Fish-forum-only columns. id is both the primary key and a foreign key to
-- the forum_posts row it extends.
CREATE TABLE fish_forum_posts (
    id INT PRIMARY KEY
        REFERENCES forum_posts (id)
        ON UPDATE CASCADE ON DELETE CASCADE,

    -- Delete this if it's in the same sequence as the other fora
    post_id INT NOT NULL UNIQUE,

    reply_count INT NOT NULL
        CHECK (reply_count >= 0),

    -- Points at another row of this same table through its post_id.
    -- NO ACTION (rather than CASCADE) is needed to avoid cascade cycles.
    reply_to_post_number INT
        REFERENCES fish_forum_posts (post_id)
        ON UPDATE NO ACTION ON DELETE NO ACTION
);

-- Golf-forum-only columns. id is both the primary key and a foreign key to
-- the forum_posts row it extends.
CREATE TABLE golf_forum_posts (
    id INT PRIMARY KEY
        REFERENCES forum_posts (id)
        ON UPDATE CASCADE ON DELETE CASCADE,

    -- Delete this if it's in the same sequence as the other fora
    post_id INT NOT NULL UNIQUE,

    profile_id INT NOT NULL,

    like_count INT NOT NULL
        CHECK (like_count >= 0)
);

-- Lookup table of models; each name may appear only once.
CREATE TABLE models (
    id   INT           IDENTITY PRIMARY KEY,
    name NVARCHAR(256) NOT NULL UNIQUE
);

-- Model output attached to a post. In this revision id is the primary key on
-- its own, so UNIQUE (id, model) rejects nothing the key does not already
-- reject.
CREATE TABLE preprocessed_posts (
    id INT PRIMARY KEY
        REFERENCES forum_posts (id)
        ON UPDATE CASCADE ON DELETE CASCADE,

    prediction NVARCHAR(256) NOT NULL,

    -- Exact decimal, constrained to the closed range 0..1.
    confidence DECIMAL(12, 4) NOT NULL
        CHECK (confidence BETWEEN 0 AND 1),

    model INT NOT NULL
        REFERENCES models (id)
        ON UPDATE CASCADE ON DELETE CASCADE,

    UNIQUE (id, model)
);

-- Base table holding only the columns common to every forum's posts.
CREATE TABLE forum_posts (
    id          INT           IDENTITY PRIMARY KEY,
    created_at  DATETIME2     NOT NULL,
    msg         NVARCHAR(MAX) NOT NULL,
    -- Defaults to the insert-time timestamp when no value is supplied.
    scrape_time DATETIME2     NOT NULL DEFAULT CURRENT_TIMESTAMP
);

-- Fish-forum-only columns. id is both the primary key and a foreign key to
-- the forum_posts row it extends.
CREATE TABLE fish_forum_posts (
    id INT PRIMARY KEY
        REFERENCES forum_posts (id)
        ON UPDATE CASCADE ON DELETE CASCADE,

    -- Delete this if it's in the same sequence as the other fora
    post_id INT NOT NULL UNIQUE,

    reply_count INT NOT NULL
        CHECK (reply_count >= 0),

    -- Points at another row of this same table through its post_id.
    -- NO ACTION (rather than CASCADE) is needed to avoid cascade cycles.
    reply_to_post_number INT
        REFERENCES fish_forum_posts (post_id)
        ON UPDATE NO ACTION ON DELETE NO ACTION
);

-- Golf-forum-only columns. id is both the primary key and a foreign key to
-- the forum_posts row it extends.
CREATE TABLE golf_forum_posts (
    id INT PRIMARY KEY
        REFERENCES forum_posts (id)
        ON UPDATE CASCADE ON DELETE CASCADE,

    -- Delete this if it's in the same sequence as the other fora
    post_id INT NOT NULL UNIQUE,

    profile_id INT NOT NULL,

    like_count INT NOT NULL
        CHECK (like_count >= 0)
);

-- Lookup table of models; each name may appear only once.
CREATE TABLE models (
    id   INT           IDENTITY PRIMARY KEY,
    name NVARCHAR(256) NOT NULL UNIQUE
);

-- Model output attached to a post. In this revision id is the primary key on
-- its own, so the table holds at most one row per post.
CREATE TABLE preprocessed_posts (
    id INT PRIMARY KEY
        REFERENCES forum_posts (id)
        ON UPDATE CASCADE ON DELETE CASCADE,

    prediction NVARCHAR(256) NOT NULL,

    -- Exact decimal, constrained to the closed range 0..1.
    confidence DECIMAL(12, 4) NOT NULL
        CHECK (confidence BETWEEN 0 AND 1),

    model INT NOT NULL
        REFERENCES models (id)
        ON UPDATE CASCADE ON DELETE CASCADE
);

-- Base table holding only the columns common to every forum's posts.
CREATE TABLE forum_posts (
    id          INT           IDENTITY PRIMARY KEY,
    created_at  DATETIME2     NOT NULL,
    msg         NVARCHAR(MAX) NOT NULL,
    -- Defaults to the insert-time timestamp when no value is supplied.
    scrape_time DATETIME2     NOT NULL DEFAULT CURRENT_TIMESTAMP
);

-- Fish-forum-only columns. id is both the primary key and a foreign key to
-- the forum_posts row it extends.
CREATE TABLE fish_forum_posts (
    id INT PRIMARY KEY
        REFERENCES forum_posts (id)
        ON UPDATE CASCADE ON DELETE CASCADE,

    -- Delete this if it's in the same sequence as the other fora
    post_id INT NOT NULL UNIQUE,

    reply_count INT NOT NULL
        CHECK (reply_count >= 0),

    -- Points at another row of this same table through its post_id.
    -- NO ACTION (rather than CASCADE) is needed to avoid cascade cycles.
    reply_to_post_number INT
        REFERENCES fish_forum_posts (post_id)
        ON UPDATE NO ACTION ON DELETE NO ACTION
);

-- Golf-forum-only columns. id is both the primary key and a foreign key to
-- the forum_posts row it extends.
CREATE TABLE golf_forum_posts (
    id INT PRIMARY KEY
        REFERENCES forum_posts (id)
        ON UPDATE CASCADE ON DELETE CASCADE,

    -- Delete this if it's in the same sequence as the other fora
    post_id INT NOT NULL UNIQUE,

    profile_id INT NOT NULL,

    like_count INT NOT NULL
        CHECK (like_count >= 0)
);

-- Lookup table of models; each name may appear only once.
CREATE TABLE models (
    id   INT           IDENTITY PRIMARY KEY,
    name NVARCHAR(256) NOT NULL UNIQUE
);

-- Model output attached to a post. In this revision id is the primary key on
-- its own, so UNIQUE (id, model) rejects nothing the key does not already
-- reject.
CREATE TABLE preprocessed_posts (
    id INT PRIMARY KEY
        REFERENCES forum_posts (id)
        ON UPDATE CASCADE ON DELETE CASCADE,

    prediction NVARCHAR(256) NOT NULL,

    -- Exact decimal, constrained to the closed range 0..1.
    confidence DECIMAL(12, 4) NOT NULL
        CHECK (confidence BETWEEN 0 AND 1),

    model INT NOT NULL
        REFERENCES models (id)
        ON UPDATE CASCADE ON DELETE CASCADE,

    UNIQUE (id, model)
);

added 2000 characters in body

Source Link

edited Sep 25, 2021 at 2:26

Reinderien

71.2k
5
76
257

If the post_id is unique, why not make it a primary key? Sometimes it is justified to have an internally-controlled primary key like id but here the justification is unclear. One catch is that if the two forums are from different websites, their ID sequences will be different and you don't have a choice but to maintain your own primary keys.

reply_to_post_number looks like it should be a foreign key, if you are able to assume that all linked posts are present in the database. Note that Transact SQL doesn't support restrict for whatever reason, and you're limited to disabling relational integrity for this foreign key to prevent cascade cycles.

preprocessed_posts.post_id should certainly be a foreign key.

it's possible that by mistake we enter a post id and/or a forum that doesn't exist (e.g., "cars")

This is what foreign keys and relational integrity are for.

If you know more about a forum than just its name, source_forum should be factored into its own table.

if I build a view that would distill the essence (= post_id plus forum_name) would be OK

This does not sound like the right way to go. Nothing I've seen here calls for a view; just a plain-old normalized relational schema.

A way to cut out a large amount of redundancy between your fish and golf tables is to make a third table, FORUM_POSTS, containing only the common columns; have FISH_FORUM_POSTS and GOLF_FORUM_POSTS only contain varying columns; and have the latter two include an ID column that is both a primary key and a foreign key to FORUM_POSTS. Conceptually this is analogous to class inheritance in the OOP world. This would allow you to have a foreign key from PREPROCESSED_POSTS straight to FORUM_POSTS. One catch is that if the two forums are from different websites, their ID sequences will be different and you don't have a choice but to maintain your own primary keys.

it's possible that by mistake we enter a post id and/or a forum that doesn't exist (e.g., "cars")

This is what foreign keys and relational integrity are for. However, to prevent redundancy, I don't think that the source forum column should exist at all. The source forum would be implied by the presence of a matching row in either of the fish or golf tables.

if I build a view that would distill the essence (= post_id plus forum_name) would be OK

This does not sound like the right way to go. Nothing I've seen here calls for a view; just a plain-old normalized relational schema.

Proposed

Runs fine on dbfiddle:

-- Base table holding only the columns common to every forum's posts.
CREATE TABLE forum_posts (
    id          INT           IDENTITY PRIMARY KEY,
    created_at  DATETIME2     NOT NULL,
    msg         NVARCHAR(MAX) NOT NULL,
    -- Defaults to the insert-time timestamp when no value is supplied.
    scrape_time DATETIME2     NOT NULL DEFAULT CURRENT_TIMESTAMP
);

-- Fish-forum-only columns. id is both the primary key and a foreign key to
-- the forum_posts row it extends.
CREATE TABLE fish_forum_posts (
    id INT PRIMARY KEY
        REFERENCES forum_posts (id)
        ON UPDATE CASCADE ON DELETE CASCADE,

    -- Delete this if it's in the same sequence as the other fora
    post_id INT NOT NULL UNIQUE,

    reply_count INT NOT NULL
        CHECK (reply_count >= 0),

    -- Points at another row of this same table through its post_id.
    -- NO ACTION (rather than CASCADE) is needed to avoid cascade cycles.
    reply_to_post_number INT
        REFERENCES fish_forum_posts (post_id)
        ON UPDATE NO ACTION ON DELETE NO ACTION
);

-- Golf-forum-only columns. id is both the primary key and a foreign key to
-- the forum_posts row it extends.
CREATE TABLE golf_forum_posts (
    id INT PRIMARY KEY
        REFERENCES forum_posts (id)
        ON UPDATE CASCADE ON DELETE CASCADE,

    -- Delete this if it's in the same sequence as the other fora
    post_id INT NOT NULL UNIQUE,

    profile_id INT NOT NULL,

    like_count INT NOT NULL
        CHECK (like_count >= 0)
);

-- Lookup table of models; each name may appear only once.
CREATE TABLE models (
    id   INT           IDENTITY PRIMARY KEY,
    name NVARCHAR(256) NOT NULL UNIQUE
);

-- Model output attached to a post. In this revision id is the primary key on
-- its own, so the table holds at most one row per post.
CREATE TABLE preprocessed_posts (
    id INT PRIMARY KEY
        REFERENCES forum_posts (id)
        ON UPDATE CASCADE ON DELETE CASCADE,

    prediction NVARCHAR(256) NOT NULL,

    -- Exact decimal, constrained to the closed range 0..1.
    confidence DECIMAL(12, 4) NOT NULL
        CHECK (confidence BETWEEN 0 AND 1),

    model INT NOT NULL
        REFERENCES models (id)
        ON UPDATE CASCADE ON DELETE CASCADE
);

If the post_id is unique, why not make it a primary key? Sometimes it is justified to have an internally-controlled primary key like id but here the justification is unclear.

reply_to_post_number looks like it should be a foreign key, if you are able to assume that all linked posts are present in the database.

preprocessed_posts.post_id should certainly be a foreign key.

it's possible that by mistake we enter a post id and/or a forum that doesn't exist (e.g., "cars")

This is what foreign keys and relational integrity are for.

If you know more about a forum than just its name, source_forum should be factored into its own table.

if I build a view that would distill the essence (= post_id plus forum_name) would be OK

This does not sound like the right way to go. Nothing I've seen here calls for a view; just a plain-old normalized relational schema.

A way to cut out a large amount of redundancy between your fish and golf tables is to make a third table, FORUM_POSTS, containing only the common columns; have FISH_FORUM_POSTS and GOLF_FORUM_POSTS only contain varying columns; and have the latter two include an ID column that is both a primary key and a foreign key to FORUM_POSTS. Conceptually this is analogous to class inheritance in the OOP world. This would allow you to have a foreign key from PREPROCESSED_POSTS straight to FORUM_POSTS. One catch is that if the two forums are from different websites, their ID sequences will be different and you don't have a choice but to maintain your own primary keys.

If the post_id is unique, why not make it a primary key? Sometimes it is justified to have an internally-controlled primary key like id but here the justification is unclear. One catch is that if the two forums are from different websites, their ID sequences will be different and you don't have a choice but to maintain your own primary keys.

reply_to_post_number looks like it should be a foreign key, if you are able to assume that all linked posts are present in the database. Note that Transact SQL doesn't support restrict for whatever reason, and you're limited to disabling relational integrity for this foreign key to prevent cascade cycles.

preprocessed_posts.post_id should certainly be a foreign key, but I'm going to propose that that column be deleted altogether and id made to be both a primary and foreign key.

A way to cut out a large amount of redundancy between your fish and golf tables is to make a third table, FORUM_POSTS, containing only the common columns; have FISH_FORUM_POSTS and GOLF_FORUM_POSTS only contain varying columns; and have the latter two include an ID column that is both a primary key and a foreign key to FORUM_POSTS. Conceptually this is analogous to class inheritance in the OOP world. This would allow you to have a foreign key from PREPROCESSED_POSTS straight to FORUM_POSTS.

it's possible that by mistake we enter a post id and/or a forum that doesn't exist (e.g., "cars")

This is what foreign keys and relational integrity are for. However, to prevent redundancy, I don't think that the source forum column should exist at all. The source forum would be implied by the presence of a matching row in either of the fish or golf tables.

if I build a view that would distill the essence (= post_id plus forum_name) would be OK

This does not sound like the right way to go. Nothing I've seen here calls for a view; just a plain-old normalized relational schema.

Proposed

Runs fine on dbfiddle:

-- Base table holding only the columns common to every forum's posts.
CREATE TABLE forum_posts (
    id          INT           IDENTITY PRIMARY KEY,
    created_at  DATETIME2     NOT NULL,
    msg         NVARCHAR(MAX) NOT NULL,
    -- Defaults to the insert-time timestamp when no value is supplied.
    scrape_time DATETIME2     NOT NULL DEFAULT CURRENT_TIMESTAMP
);

-- Fish-forum-only columns. id is both the primary key and a foreign key to
-- the forum_posts row it extends.
CREATE TABLE fish_forum_posts (
    id INT PRIMARY KEY
        REFERENCES forum_posts (id)
        ON UPDATE CASCADE ON DELETE CASCADE,

    -- Delete this if it's in the same sequence as the other fora
    post_id INT NOT NULL UNIQUE,

    reply_count INT NOT NULL
        CHECK (reply_count >= 0),

    -- Points at another row of this same table through its post_id.
    -- NO ACTION (rather than CASCADE) is needed to avoid cascade cycles.
    reply_to_post_number INT
        REFERENCES fish_forum_posts (post_id)
        ON UPDATE NO ACTION ON DELETE NO ACTION
);

-- Golf-forum-only columns. id is both the primary key and a foreign key to
-- the forum_posts row it extends.
CREATE TABLE golf_forum_posts (
    id INT PRIMARY KEY
        REFERENCES forum_posts (id)
        ON UPDATE CASCADE ON DELETE CASCADE,

    -- Delete this if it's in the same sequence as the other fora
    post_id INT NOT NULL UNIQUE,

    profile_id INT NOT NULL,

    like_count INT NOT NULL
        CHECK (like_count >= 0)
);

-- Lookup table of models; each name may appear only once.
CREATE TABLE models (
    id   INT           IDENTITY PRIMARY KEY,
    name NVARCHAR(256) NOT NULL UNIQUE
);

-- Model output attached to a post. In this revision id is the primary key on
-- its own, so the table holds at most one row per post.
CREATE TABLE preprocessed_posts (
    id INT PRIMARY KEY
        REFERENCES forum_posts (id)
        ON UPDATE CASCADE ON DELETE CASCADE,

    prediction NVARCHAR(256) NOT NULL,

    -- Exact decimal, constrained to the closed range 0..1.
    confidence DECIMAL(12, 4) NOT NULL
        CHECK (confidence BETWEEN 0 AND 1),

    model INT NOT NULL
        REFERENCES models (id)
        ON UPDATE CASCADE ON DELETE CASCADE
);

added 265 characters in body

Source Link

edited Sep 24, 2021 at 23:19

Reinderien

71.2k
5
76
257

Loading

Source Link

answered Sep 24, 2021 at 23:10

Reinderien

71.2k
5
76
257

Loading

Stack Exchange Network

Return to Answer

Proposed

Proposed