from forum_spider.spiders.ptt import PttSpider
from forum_spider.spiders.dcard import DcardSpider
from forum_spider.spiders.gamer import GamerSpider
from scrapy.crawler import CrawlerProcess
# Usage example: schedule several forum spiders on a single CrawlerProcess,
# then start them together.
proc = CrawlerProcess()

# Each entry pairs a spider class with the keyword arguments passed to it.
crawl_jobs = (
    (GamerSpider, {'board_bsn': [60076], 'max_page': 1}),
    (PttSpider, {'board': ['Gossiping'], 'max_page': 1}),
)
for spider_cls, spider_kwargs in crawl_jobs:
    proc.crawl(spider_cls, **spider_kwargs)

# Every crawl must be scheduled before start() is called.
proc.start()
- board_bsn: list
- max_page: int
- board: list
- max_page: int
- board: list
- max_page: int
- title
- forum
- text
- board
- author
- url
- create_date
- last_update_date
- ws_pos
- ner
- set in spiders/custom_settings.py
- drop articles that have an invalid:
- text
- title
- url
- cut / pos / ner
- output:
- 'ws_pos': {'article': [pair('word', 'pos'), ...], 'title': [pair('word', 'pos'), ...]}
- 'ner': [(from_index, to_index, 'NER_Category', 'word'), ...]
- DB_HOST / DB_NAME: settings.py
- DOCUMENT: spiders/custom_settings.py