Deduplicate threads on Newest forum tab #38

Merged
noah merged 2 commits from deduplicate-forum-newest into main 2024-02-16 03:56:16 +00:00
3 changed files with 173 additions and 71 deletions
Showing only changes of commit 62d56d5924 - Show all commits

View File

@ -89,9 +89,16 @@ func MapCommentPhotos(comments []*Comment) (CommentPhotoMap, error) {
)
for _, c := range comments {
if c == nil {
continue
}
IDs = append(IDs, c.ID)
}
if len(IDs) == 0 {
return result, nil
}
res := DB.Model(&CommentPhoto{}).Where("comment_id IN ?", IDs).Find(&ps)
if res.Error != nil {
return nil, res.Error

View File

@ -1,7 +1,7 @@
package models
import (
"fmt"
"sort"
"strings"
"time"
@ -25,10 +25,16 @@ type RecentPost struct {
func PaginateRecentPosts(user *User, categories []string, allComments bool, pager *Pagination) ([]*RecentPost, error) {
var (
result = []*RecentPost{}
query = (&Comment{}).Preload()
blockedUserIDs = BlockedUserIDs(user)
wheres = []string{"table_name = 'threads'"}
// Separate the WHERE clauses that involve forums/threads from the ones
// that involve comments. Rationale: if the user is getting a de-duplicated
// thread view, we'll end up running two queries - one to get all threads and
// another to get the latest comments, and the WHERE clauses need to be separate.
wheres = []string{}
placeholders = []interface{}{}
comment_wheres = []string{"table_name = 'threads'"}
comment_ph = []interface{}{}
)
if len(categories) > 0 {
@ -48,12 +54,12 @@ func PaginateRecentPosts(user *User, categories []string, allComments bool, page
// Blocked users?
if len(blockedUserIDs) > 0 {
wheres = append(wheres, "comments.user_id NOT IN ?")
placeholders = append(placeholders, blockedUserIDs)
comment_wheres = append(comment_wheres, "comments.user_id NOT IN ?")
comment_ph = append(comment_ph, blockedUserIDs)
}
// Don't show comments from banned or disabled accounts.
wheres = append(wheres, `
comment_wheres = append(comment_wheres, `
EXISTS (
SELECT 1
FROM users
@ -63,81 +69,27 @@ func PaginateRecentPosts(user *User, categories []string, allComments bool, page
`)
// Get the page of recent forum comment IDs of all time.
type scanner struct {
CommentID uint64
ThreadID *uint64
ForumID *uint64
UpdatedAt time.Time
}
var scan []scanner
var scan NewestForumPostsScanner
// Deduplicate forum threads: if one thread is BLOWING UP with replies, we should only
// mention the thread once and show the newest comment so it doesn't spam the whole page.
if config.Current.Database.IsPostgres && !allComments {
// Deduplicating is supported in Postgres but not SQLite. We also need a custom
// total count query which is 99% the same as the select query except for the
// SELECT and ORDER BY bookends.
subquery := fmt.Sprintf(`
FROM (
SELECT DISTINCT ON (threads.id)
comments.id AS comment_id,
threads.id AS thread_id,
forums.id AS forum_id,
comments.updated_at AS updated_at
FROM comments
LEFT OUTER JOIN threads ON (table_name='threads' AND table_id=threads.id)
LEFT OUTER JOIN forums ON (threads.forum_id=forums.id)
WHERE %s
ORDER BY threads.id
) AS subquery
`, strings.Join(wheres, " AND "))
query = DB.Raw(fmt.Sprintf(`
SELECT comment_id, thread_id, forum_id, updated_at
%s
ORDER BY subquery.updated_at DESC
OFFSET %d LIMIT %d
`, subquery, pager.GetOffset(), pager.PerPage), placeholders...)
// Get a count of records.
DB.Raw(fmt.Sprintf("SELECT count(*) %s", subquery), placeholders...).Count(&pager.Total)
query = query.Find(&scan)
if query.Error != nil {
return nil, query.Error
// Note: only Postgres supports this function (SELECT DISTINCT ON).
if res, err := ScanLatestForumCommentsPerThread(wheres, comment_wheres, placeholders, comment_ph, pager); err != nil {
return nil, err
} else {
scan = res
}
} else {
// SQLite/non-Postgres doesn't support DISTINCT ON, this is the old query which
// shows objectively all comments and a popular thread may dominate the page.
query = DB.Table("comments").Select(
`comments.id AS comment_id,
threads.id AS thread_id,
forums.id AS forum_id,
comments.updated_at AS updated_at`,
).Joins(
"LEFT OUTER JOIN threads ON (table_name = 'threads' AND table_id = threads.id)",
).Joins(
"LEFT OUTER JOIN forums ON (threads.forum_id = forums.id)",
).Where(
strings.Join(wheres, " AND "),
placeholders...,
).Order("comments.updated_at desc")
query.Model(&Comment{}).Count(&pager.Total)
// Execute the query.
query = query.Offset(pager.GetOffset()).Limit(pager.PerPage).Find(&scan)
if query.Error != nil {
return nil, query.Error
if res, err := ScanLatestForumCommentsAll(wheres, comment_wheres, placeholders, comment_ph, pager); err != nil {
return nil, err
} else {
scan = res
}
}
// Get the total for the pager and scan the page of ID sets.
// query.Model(&Comment{}).Count(&pager.Total)
// query = query.Offset(pager.GetOffset()).Limit(pager.PerPage).Find(&scan)
if query.Error != nil {
return nil, query.Error
}
// Ingest the results.
var (
commentIDs = []uint64{} // collect distinct IDs
@ -232,6 +184,13 @@ func PaginateRecentPosts(user *User, categories []string, allComments bool, page
}
}
// Is the new comment unavailable? (e.g. blocked, banned, disabled)
if rc.Comment == nil {
rc.Comment = &Comment{
Message: "[unavailable]",
}
}
if f, ok := forums[rc.ForumID]; ok {
rc.Forum = f
}
@ -243,3 +202,140 @@ func PaginateRecentPosts(user *User, categories []string, allComments bool, page
return result, nil
}
// NewestForumPosts collects the IDs of the latest forum posts.
type NewestForumPosts struct {
CommentID uint64
ThreadID *uint64
ForumID *uint64
UpdatedAt time.Time
}
type NewestForumPostsScanner []NewestForumPosts
// ScanLatestForumCommentsAll returns a scan of Newest forum posts containing ALL comments, which may
// include runs of 'duplicate' forum threads if a given thread was commented on rapidly. This is the classic
// 'Newest' tab behavior, showing just ALL forum comments by newest.
func ScanLatestForumCommentsAll(wheres, comment_wheres []string, placeholders, comment_ph []interface{}, pager *Pagination) (NewestForumPostsScanner, error) {
var scan NewestForumPostsScanner
// This one is all one joined query so join the wheres/placeholders.
wheres = append(wheres, comment_wheres...)
placeholders = append(placeholders, comment_ph...)
// SQLite/non-Postgres doesn't support DISTINCT ON, this is the old query which
// shows objectively all comments and a popular thread may dominate the page.
query := DB.Table("comments").Select(
`comments.id AS comment_id,
threads.id AS thread_id,
forums.id AS forum_id,
comments.updated_at AS updated_at`,
).Joins(
"LEFT OUTER JOIN threads ON (table_name = 'threads' AND table_id = threads.id)",
).Joins(
"LEFT OUTER JOIN forums ON (threads.forum_id = forums.id)",
).Where(
strings.Join(wheres, " AND "),
placeholders...,
).Order("comments.updated_at desc")
query.Model(&Comment{}).Count(&pager.Total)
// Execute the query.
query = query.Offset(pager.GetOffset()).Limit(pager.PerPage).Find(&scan)
return scan, query.Error
}
// ScanLatestForumCommentsPerThread returns a scan of Newest forum posts, deduplicated by thread.
// Each thread ID will only appear once in the result, paired with the newest comment in that
// thread.
func ScanLatestForumCommentsPerThread(wheres, comment_wheres []string, placeholders, comment_ph []interface{}, pager *Pagination) (NewestForumPostsScanner, error) {
var (
result NewestForumPostsScanner
threadIDs = []uint64{}
// Query for ALL thread IDs (in forums the user can see).
query = DB.Table(
"threads",
).Select(`
DISTINCT ON (threads.id)
threads.forum_id,
threads.id AS thread_id,
threads.updated_at AS updated_at
`).Joins(
"JOIN forums ON (threads.forum_id = forums.id)",
).Where(
strings.Join(wheres, " AND "),
placeholders...,
).Order(
"threads.id",
)
)
query = query.Find(&result)
if query.Error != nil {
return result, query.Error
}
pager.Total = int64(len(result))
// Reorder the result by timestamp.
sort.Slice(result, func(i, j int) bool {
return result[i].UpdatedAt.After(result[j].UpdatedAt)
})
// Subslice the result per the user's pagination setting.
var (
start = pager.GetOffset()
stop = start + pager.PerPage
)
if start > len(result) {
return NewestForumPostsScanner{}, nil
} else if stop > len(result) {
stop = len(result)
}
result = result[start:stop]
// Map the thread IDs to their result row.
var threadMap = map[uint64]int{}
for i, row := range result {
threadIDs = append(threadIDs, *row.ThreadID)
threadMap[*row.ThreadID] = i
}
// With these thread IDs, select the newest comments.
type scanner struct {
ThreadID uint64
CommentID uint64
}
var scan []scanner
err := DB.Table(
"comments",
).Select(
"table_id AS thread_id, id AS comment_id",
).Where(
`table_name='threads' AND table_id IN ?
AND updated_at = (SELECT MAX(updated_at)
FROM comments c2
WHERE c2.table_name=comments.table_name
AND c2.table_id=comments.table_id
)`,
threadIDs,
).Where(
strings.Join(comment_wheres, " AND "),
comment_ph...,
).Order(
"updated_at desc",
).Scan(&scan)
if err.Error != nil {
log.Error("Getting most recent post IDs: %s", err.Error)
return result, err.Error
}
// Populate the comment IDs back in.
for _, row := range scan {
if idx, ok := threadMap[row.ThreadID]; ok {
result[idx].CommentID = row.CommentID
}
}
return result, query.Error
}

View File

@ -230,7 +230,6 @@ func (ts ForumStatsMap) generateRecentPosts(IDs []uint64) {
"comments",
).Select(
"table_id AS thread_id, id AS comment_id",
// "forum_id, id AS thread_id, updated_at",
).Where(
`table_name='threads' AND table_id IN ?
AND updated_at = (SELECT MAX(updated_at)