-
Notifications
You must be signed in to change notification settings - Fork 0
/
app.ddlog
137 lines (119 loc) · 3.51 KB
/
app.ddlog
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
###########################################################
# Benjamin Lampel, Joel Dick, Wei-Tsung Lin #
# 293n 2016 #
# DDlog file for DeepDive analysis: #
# rules for determining users from non-users #
###########################################################
###########################################################
# Relations (Input Data): #
# equivalent to db table #
###########################################################
# Input relation: rows taken from university subreddits ONLY.
# NOTE(review): the columns (title, selftext, is_self, thumbnail) look like
# submission fields rather than comment fields -- confirm against the loader.
# Rules in this file match this relation positionally (16 columns), so the
# column order below must not change.
@source
uni_sub_comments(
    created_utc   int,
    subreddit     text,
    author        text,
    domain        text,
    url           text,
    num_comments  int,
    score         int,
    ups           int,
    downs         int,
    title         text,
    selftext      text,
    gilded        int,
    over_18       bool,
    thumbnail     text,
    subreddit_id  text,
    is_self       bool
).
###########################################################
# Random variable to predict: #
# whether a reddit post does well on a sub #
###########################################################
# Random variable (declared with the trailing "?"): whether the post
# identified by (title, author) is "good".
# The author column references reddit_user.author; reddit_user declares only
# the columns author and num_posts, so the reference must name "author".
@extraction
is_good?(
    @key
    title text,
    @key
    @references(relation="reddit_user", column="author", alias="reddit_user")
    author text
).
# One row per distinct author found in uni_sub_comments, together with the
# number of rows (with a non-NULL title) that author contributed.
reddit_user(
    @key
    author text,
    num_posts int
).
# The head has to supply both declared columns. Rows are grouped by the
# non-aggregated head term (author) and COUNT(title) fills num_posts.
# author is column 3 and title is column 10 of uni_sub_comments.
reddit_user(author, COUNT(title)) :-
    uni_sub_comments(
        _, _, author, _, _,
        _, _, _, _, title,
        _, _, _, _, _, _
    ).
###########################################################
# Feature Extraction: #
# finding relevant info in comments #
###########################################################
# Output relation of utc_to_hour: creation time converted to an hour of the
# day, 24h format.
# NOTE(review): there is no key column here, so an hour cannot be joined back
# to the row it was computed from -- confirm this is intended.
time_created(
    time int
).
# UDF declaration: udf/utc_to_hour.py receives created_utc as TSV lines and
# emits rows shaped like time_created.
# NOTE(review): created_utc is declared text here but int in
# uni_sub_comments -- confirm which one the script expects.
function utc_to_hour
    over (created_utc text)
    returns rows like time_created
    implementation "udf/utc_to_hour.py" handles tsv lines.
# Run utc_to_hour over the created_utc (first) column of every
# uni_sub_comments row and append the results to time_created.
time_created += utc_to_hour(created_utc) :-
    uni_sub_comments(
        created_utc, _, _, _, _,
        _, _, _, _, _,
        _, _, _, _, _, _
    ).
###########################################################
# Distant Supervision: #
# supervised learning from training set #
###########################################################
# Supervision labels produced by the supervise UDF, one row per
# (title, author) pair; both key columns reference the is_good variable.
# The *_label columns and rule_id are filled in by udf/supervise.py; their
# exact encoding is defined there, not in this file.
@extraction
post_label(
    @key
    @references(relation="is_good", column="title", alias="is_good")
    title text,

    @key
    @references(relation="is_good", column="author", alias="is_good")
    author text,

    @navigable
    selftext_len int,

    @navigable
    score_label int,

    @navigable
    time_label int,

    @navigable
    num_comments_label int,

    @navigable
    isself_label int,

    @navigable
    thumbnail_label int,

    @navigable
    rule_id text
).
# UDF declaration: the supervision logic lives in udf/supervise.py, which
# receives the eight fields below as TSV lines and emits rows shaped like
# post_label.
function supervise
    over (
        title         text,
        author        text,
        selftext      text,
        score         int,
        time          int,
        num_comments  int,
        is_self       bool,
        thumbnail     text
    )
    returns rows like post_label
    implementation "udf/supervise.py" handles tsv lines.
# Feed every uni_sub_comments row to the supervise UDF and append its output
# to post_label. Every head variable has to be bound by the body atom, which
# lists all 16 columns of uni_sub_comments in declaration order.
# NOTE(review): uni_sub_comments has no hour-of-day column, so `time` is bound
# to created_utc (column 1) here -- confirm udf/supervise.py expects the raw
# UTC value rather than an hour.
post_label += supervise(
    title, author, selftext, score, time,
    num_comments, is_self, thumbnail) :-
    uni_sub_comments(
        time, _, author, _, _,
        num_comments, score, _, _, title,
        selftext, _, _, thumbnail, _, is_self
    ).
###########################################################
# Inference Rules: #
# weights that help decide if a post is good #
###########################################################
# Ups bonus: a factor on is_good(title, author) for every uni_sub_comments
# row, with the weight parameterized by that row's ups value (column 8).
@weight(ups)
is_good(title, author) :-
    uni_sub_comments(
        _, _, author, _, _,
        _, _, ups, _, title,
        _, _, _, _, _, _
    ).
# Num comments influence: a factor on is_good(title, author) for every
# uni_sub_comments row, with the weight parameterized by that row's
# num_comments value (column 6).
@weight(num_comments)
is_good(title, author) :-
    uni_sub_comments(
        _, _, author, _, _,
        num_comments, _, _, _, title,
        _, _, _, _, _, _
    ).