Skip to content

Commit

Permalink
Expanded our labeled input based on errors identified in a preliminary drudge domain analysis
Browse files Browse the repository at this point in the history
  • Loading branch information
palewire committed Aug 23, 2022
1 parent 93c883b commit b0edab0
Show file tree
Hide file tree
Showing 10 changed files with 2,744 additions and 2,621 deletions.
260 changes: 133 additions & 127 deletions Pipfile.lock

Large diffs are not rendered by default.

4,573 changes: 2,335 additions & 2,238 deletions _notebooks/input/labeled.csv

Large diffs are not rendered by default.

222 changes: 122 additions & 100 deletions _notebooks/label.ipynb

Large diffs are not rendered by default.

Binary file modified _notebooks/output/path-and-text-model.pickle
Binary file not shown.
Binary file modified _notebooks/output/path-only-model.pickle
Binary file not shown.
121 changes: 62 additions & 59 deletions _notebooks/output/wrong.csv

Large diffs are not rendered by default.

187 changes: 90 additions & 97 deletions _notebooks/train.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -136,16 +136,16 @@
"output_type": "stream",
"text": [
"<class 'pandas.core.frame.DataFrame'>\n",
"RangeIndex: 2803 entries, 0 to 2802\n",
"RangeIndex: 2891 entries, 0 to 2890\n",
"Data columns (total 4 columns):\n",
" # Column Non-Null Count Dtype \n",
"--- ------ -------------- ----- \n",
" 0 text 2803 non-null object\n",
" 1 url 2803 non-null object\n",
" 2 handle 2803 non-null object\n",
" 3 is_story 2803 non-null int64 \n",
" 0 text 2891 non-null object\n",
" 1 url 2891 non-null object\n",
" 2 handle 2891 non-null object\n",
" 3 is_story 2891 non-null int64 \n",
"dtypes: int64(1), object(3)\n",
"memory usage: 87.7+ KB\n"
"memory usage: 90.5+ KB\n"
]
}
],
Expand Down Expand Up @@ -189,57 +189,57 @@
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>A Tradition of Violence</td>\n",
" <td>https://knock-la.com/tradition-of-violence-las...</td>\n",
" <td>knockdotla</td>\n",
" <td>Trouble in Kenya's Flower Fields</td>\n",
" <td>https://100r.org/2017/12/trouble-in/</td>\n",
" <td>100reporters</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>8\\n\\n\\t\\t\\t\\t\\t\\t\\t\\t\\t \\t\\tBad Education</td>\n",
" <td>https://jewishcurrents.org/bad-education</td>\n",
" <td>jewishcurrents</td>\n",
" <td>0</td>\n",
" <td>Asylum for Sale Refugees Say Some U.N. Workers...</td>\n",
" <td>https://100r.org/2019/04/unhcr-corruption-refu...</td>\n",
" <td>100reporters</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>69\\n\\n\\t\\t\\t\\t\\t\\tView Slide Show</td>\n",
" <td>https://www.nationalreview.com/photos/russia-u...</td>\n",
" <td>nro</td>\n",
" <td>Documentaries as AdvertisingCorporate Interest...</td>\n",
" <td>https://100r.org/2019/12/documentaries-as-adve...</td>\n",
" <td>100reporters</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>20 Under 40</td>\n",
" <td>https://www.lagrangenews.com/20-under-40/</td>\n",
" <td>lagrangenews</td>\n",
" <td>Pandemic Drives Wave of Property Grabs in Zambia</td>\n",
" <td>https://100r.org/2020/12/propertygrabs/</td>\n",
" <td>100reporters</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>Bela’s Pilgrim</td>\n",
" <td>https://jewishcurrents.org/belas-pilgrim</td>\n",
" <td>jewishcurrents</td>\n",
" <td>Did Industry Funding Influence an FDA Investig...</td>\n",
" <td>https://100r.org/2022/07/did-industry-funding-...</td>\n",
" <td>100reporters</td>\n",
" <td>1</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" text \\\n",
"0 A Tradition of Violence \n",
"1 8\\n\\n\\t\\t\\t\\t\\t\\t\\t\\t\\t \\t\\tBad Education \n",
"2 69\\n\\n\\t\\t\\t\\t\\t\\tView Slide Show \n",
"3 20 Under 40 \n",
"4 Bela’s Pilgrim \n",
" text \\\n",
"0 Trouble in Kenya's Flower Fields \n",
"1 Asylum for Sale Refugees Say Some U.N. Workers... \n",
"2 Documentaries as AdvertisingCorporate Interest... \n",
"3 Pandemic Drives Wave of Property Grabs in Zambia \n",
"4 Did Industry Funding Influence an FDA Investig... \n",
"\n",
" url handle is_story \n",
"0 https://knock-la.com/tradition-of-violence-las... knockdotla 1 \n",
"1 https://jewishcurrents.org/bad-education jewishcurrents 0 \n",
"2 https://www.nationalreview.com/photos/russia-u... nro 1 \n",
"3 https://www.lagrangenews.com/20-under-40/ lagrangenews 1 \n",
"4 https://jewishcurrents.org/belas-pilgrim jewishcurrents 1 "
" url handle is_story \n",
"0 https://100r.org/2017/12/trouble-in/ 100reporters 1 \n",
"1 https://100r.org/2019/04/unhcr-corruption-refu... 100reporters 1 \n",
"2 https://100r.org/2019/12/documentaries-as-adve... 100reporters 1 \n",
"3 https://100r.org/2020/12/propertygrabs/ 100reporters 1 \n",
"4 https://100r.org/2022/07/did-industry-funding-... 100reporters 1 "
]
},
"execution_count": 8,
Expand Down Expand Up @@ -344,86 +344,79 @@
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>A Tradition of Violence</td>\n",
" <td>https://knock-la.com/tradition-of-violence-las...</td>\n",
" <td>knockdotla</td>\n",
" <td>Trouble in Kenya's Flower Fields</td>\n",
" <td>https://100r.org/2017/12/trouble-in/</td>\n",
" <td>100reporters</td>\n",
" <td>1</td>\n",
" <td>/tradition-of-violence-lasd-gang-history/</td>\n",
" <td>knock-la</td>\n",
" <td>/2017/12/trouble-in/</td>\n",
" <td>100r</td>\n",
" <td></td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>8\\n\\n\\t\\t\\t\\t\\t\\t\\t\\t\\t \\t\\tBad Education</td>\n",
" <td>https://jewishcurrents.org/bad-education</td>\n",
" <td>jewishcurrents</td>\n",
" <td>0</td>\n",
" <td>/bad-education</td>\n",
" <td>jewishcurrents</td>\n",
" <td>Asylum for Sale Refugees Say Some U.N. Workers...</td>\n",
" <td>https://100r.org/2019/04/unhcr-corruption-refu...</td>\n",
" <td>100reporters</td>\n",
" <td>1</td>\n",
" <td>/2019/04/unhcr-corruption-refugee-resettlement/</td>\n",
" <td>100r</td>\n",
" <td></td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>69\\n\\n\\t\\t\\t\\t\\t\\tView Slide Show</td>\n",
" <td>https://www.nationalreview.com/photos/russia-u...</td>\n",
" <td>nro</td>\n",
" <td>Documentaries as AdvertisingCorporate Interest...</td>\n",
" <td>https://100r.org/2019/12/documentaries-as-adve...</td>\n",
" <td>100reporters</td>\n",
" <td>1</td>\n",
" <td>/photos/russia-ukraine-war-week-22/</td>\n",
" <td>nationalreview</td>\n",
" <td>www</td>\n",
" <td>/2019/12/documentaries-as-advertising/</td>\n",
" <td>100r</td>\n",
" <td></td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>20 Under 40</td>\n",
" <td>https://www.lagrangenews.com/20-under-40/</td>\n",
" <td>lagrangenews</td>\n",
" <td>Pandemic Drives Wave of Property Grabs in Zambia</td>\n",
" <td>https://100r.org/2020/12/propertygrabs/</td>\n",
" <td>100reporters</td>\n",
" <td>1</td>\n",
" <td>/20-under-40/</td>\n",
" <td>lagrangenews</td>\n",
" <td>www</td>\n",
" <td>/2020/12/propertygrabs/</td>\n",
" <td>100r</td>\n",
" <td></td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>Bela’s Pilgrim</td>\n",
" <td>https://jewishcurrents.org/belas-pilgrim</td>\n",
" <td>jewishcurrents</td>\n",
" <td>Did Industry Funding Influence an FDA Investig...</td>\n",
" <td>https://100r.org/2022/07/did-industry-funding-...</td>\n",
" <td>100reporters</td>\n",
" <td>1</td>\n",
" <td>/belas-pilgrim</td>\n",
" <td>jewishcurrents</td>\n",
" <td>/2022/07/did-industry-funding-influence-an-fda...</td>\n",
" <td>100r</td>\n",
" <td></td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" text \\\n",
"0 A Tradition of Violence \n",
"1 8\\n\\n\\t\\t\\t\\t\\t\\t\\t\\t\\t \\t\\tBad Education \n",
"2 69\\n\\n\\t\\t\\t\\t\\t\\tView Slide Show \n",
"3 20 Under 40 \n",
"4 Bela’s Pilgrim \n",
"\n",
" url handle \\\n",
"0 https://knock-la.com/tradition-of-violence-las... knockdotla \n",
"1 https://jewishcurrents.org/bad-education jewishcurrents \n",
"2 https://www.nationalreview.com/photos/russia-u... nro \n",
"3 https://www.lagrangenews.com/20-under-40/ lagrangenews \n",
"4 https://jewishcurrents.org/belas-pilgrim jewishcurrents \n",
" text \\\n",
"0 Trouble in Kenya's Flower Fields \n",
"1 Asylum for Sale Refugees Say Some U.N. Workers... \n",
"2 Documentaries as AdvertisingCorporate Interest... \n",
"3 Pandemic Drives Wave of Property Grabs in Zambia \n",
"4 Did Industry Funding Influence an FDA Investig... \n",
"\n",
" is_story path domain \\\n",
"0 1 /tradition-of-violence-lasd-gang-history/ knock-la \n",
"1 0 /bad-education jewishcurrents \n",
"2 1 /photos/russia-ukraine-war-week-22/ nationalreview \n",
"3 1 /20-under-40/ lagrangenews \n",
"4 1 /belas-pilgrim jewishcurrents \n",
" url handle is_story \\\n",
"0 https://100r.org/2017/12/trouble-in/ 100reporters 1 \n",
"1 https://100r.org/2019/04/unhcr-corruption-refu... 100reporters 1 \n",
"2 https://100r.org/2019/12/documentaries-as-adve... 100reporters 1 \n",
"3 https://100r.org/2020/12/propertygrabs/ 100reporters 1 \n",
"4 https://100r.org/2022/07/did-industry-funding-... 100reporters 1 \n",
"\n",
" subdomain \n",
"0 \n",
"1 \n",
"2 www \n",
"3 www \n",
"4 "
" path domain subdomain \n",
"0 /2017/12/trouble-in/ 100r \n",
"1 /2019/04/unhcr-corruption-refugee-resettlement/ 100r \n",
"2 /2019/12/documentaries-as-advertising/ 100r \n",
"3 /2020/12/propertygrabs/ 100r \n",
"4 /2022/07/did-industry-funding-influence-an-fda... 100r "
]
},
"execution_count": 12,
Expand Down Expand Up @@ -719,12 +712,12 @@
"text": [
" precision recall f1-score support\n",
"\n",
" 0 0.98 0.95 0.96 550\n",
" 1 0.92 0.96 0.94 358\n",
" 0 0.96 0.94 0.95 539\n",
" 1 0.92 0.94 0.93 398\n",
"\n",
" accuracy 0.95 908\n",
" macro avg 0.95 0.96 0.95 908\n",
"weighted avg 0.95 0.95 0.95 908\n",
" accuracy 0.94 937\n",
" macro avg 0.94 0.94 0.94 937\n",
"weighted avg 0.94 0.94 0.94 937\n",
"\n"
]
}
Expand Down Expand Up @@ -778,12 +771,12 @@
"text": [
" precision recall f1-score support\n",
"\n",
" 0 0.97 0.95 0.96 550\n",
" 1 0.92 0.95 0.94 358\n",
" 0 0.94 0.94 0.94 539\n",
" 1 0.92 0.92 0.92 398\n",
"\n",
" accuracy 0.95 908\n",
" macro avg 0.95 0.95 0.95 908\n",
"weighted avg 0.95 0.95 0.95 908\n",
" accuracy 0.93 937\n",
" macro avg 0.93 0.93 0.93 937\n",
"weighted avg 0.93 0.93 0.93 937\n",
"\n"
]
}
Expand Down
2 changes: 2 additions & 0 deletions storysniffer/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,8 @@ class StorySniffer:
PATH_BLACKLIST = (
"",
"/",
"/privacy/",
"/about/",
)

EXT_BLACKLIST = (
Expand Down
Binary file modified storysniffer/path-and-text-model.pickle
Binary file not shown.
Binary file modified storysniffer/path-only-model.pickle
Binary file not shown.

0 comments on commit b0edab0

Please sign in to comment.