Web crawler

This is a small project in which I use Python Scrapy to fetch stock-index information automatically and push it to a Telegram chat bot.

Crawler

  • Based on Python Scrapy; XPath selectors extract the information I need
  • Target websites include:
    • https://www.boursedirect.fr/
    • http://www.aastocks.com/

The spider below requests each index page and collects the figures into a pandas DataFrame:
import scrapy
import pandas as pd


class BSSpider(scrapy.Spider):
    name = "usnotice"

    def __init__(self):
        super().__init__()
        # One row per index; each row is filled in as its page is crawled.
        self.collect = pd.DataFrame(
            columns=["open", "close", "vari"],
            index=["ShangHai:", "GoldER:", "DJI  :", "NASDAQ:", "PSI:", "sp500:", "ftse:"],
        )
        self.count = 0

    def start_requests(self):
        shanzen = "http://www.aastocks.com/tc/stocks/market/index/china-index.aspx"
        yield scrapy.Request(url=shanzen, callback=self.craw_shanzen)
        golder = "https://www.boursedirect.fr/fr/marche/chicago-mercantile-exchange/s-p-gsci-gold-index-excess-ret-SPGSGCP-USD-XCME/seance"
        yield scrapy.Request(url=golder, callback=self.craw)
        dji = "https://www.boursedirect.fr/fr/marche/new-york-stock-exchange-inc/dow-jones-industrial-average-DJI-USD-XNYS/seance"
        yield scrapy.Request(url=dji, callback=self.craw)
        nsd = "https://www.boursedirect.fr/fr/marche/nasdaq-all-markets/nasdaq-100-NDX-USD-XNAS/seance"
        yield scrapy.Request(url=nsd, callback=self.craw)
        psi = "https://www.boursedirect.fr/fr/marche/nasdaq-all-markets/phlx-semiconductor-SOX-USD-XNAS/seance"
        yield scrapy.Request(url=psi, callback=self.craw)
        sp500 = "https://www.boursedirect.fr/fr/marche/chicago-mercantile-exchange/s-p-500-SP500-USD-XCME/seance"
        yield scrapy.Request(url=sp500, callback=self.craw)
        ftse = "https://www.boursedirect.fr/fr/marche/no-market-e-g-unlisted/ftse-china-a50-index-XIN9-CNY-XXXX/seance"
        yield scrapy.Request(url=ftse, callback=self.craw)

    def craw_shanzen(self, response):
        # Row 2 of the aastocks index table holds the Shanghai figures.
        table = response.xpath('//div[contains(@class, "content")]//table//tr')
        cells = table[2].xpath('td//text()').extract()
        open_price = float(cells[-2].replace(",", ""))
        close_price = float(cells[2].replace(",", ""))
        vari_value = round(close_price - open_price, 2)
        vari = str(vari_value) + " (" + str(round(vari_value * 100 / open_price, 2)) + "%)"
        # Record the Shanghai row in the summary table.
        self.collect.loc["ShangHai:"] = [open_price, close_price, vari]
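
start_requests points the six Bourse Direct pages at a shared self.craw callback that is not shown above. A minimal sketch of what it could look like, assuming the quote pages expose opening and closing values that an XPath can reach (the selector below is hypothetical, not the real one):

    def craw(self, response):
        # Hypothetical XPath: the real boursedirect.fr selectors are not
        # shown in this README and would need to be checked against the pages.
        values = response.xpath('//table[contains(@class, "seance")]//td/text()').extract()
        open_price = float(values[0].replace(",", ""))
        close_price = float(values[1].replace(",", ""))
        vari_value = round(close_price - open_price, 2)
        vari = str(vari_value) + " (" + str(round(vari_value * 100 / open_price, 2)) + "%)"
        # Responses arrive in arbitrary order; a real version would map
        # response.url to its row instead of relying on self.count.
        row = self.collect.index[1 + self.count]
        self.collect.loc[row] = [open_price, close_price, vari]
        self.count += 1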
  • Crawler output: the collected figures end up in table.png (see the rendering sketch at the end of the Chat bot section)
  • Windows Task Scheduler runs the crawl daily (a minimal runner sketch follows this list)
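
For the daily update, Task Scheduler only needs a command to run. A minimal runner sketch using Scrapy's CrawlerProcess; the file name run_spider.py and the module path notice.spiders.usnotice are assumptions, not taken from the original project:

# run_spider.py - invoked daily by Windows Task Scheduler, e.g.:
#   schtasks /Create /SC DAILY /TN usnotice /TR "python run_spider.py" /ST 08:00
# Run from the Scrapy project directory so get_project_settings() finds scrapy.cfg.
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

from notice.spiders.usnotice import BSSpider  # assumed module path

process = CrawlerProcess(get_project_settings())
process.crawl(BSSpider)
process.start()  # blocks until the crawl finishes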

Chat bot

The bot posts table.png to each chat through the Telegram sendPhoto endpoint:

    # Requires "import requests" at the top of the spider module.
    def sendbot(self, text):
        TOKEN = "MY TOKEN"
        path = r"D:\python learn\scrapy\notice\notice\spiders\table.png"
        url = "https://api.telegram.org/bot" + TOKEN + "/sendPhoto"
        for chat_id in ["***********", "************"]:
            # Reopen the file for each chat so every upload starts at byte 0.
            with open(path, "rb") as photo:
                files = {"photo": photo}
                data = {"chat_id": chat_id, "caption": text}
                requests.post(url, files=files, data=data)
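
sendbot posts table.png, but nothing above shows how that image is produced from self.collect. One minimal way to render the DataFrame, assuming matplotlib (the original project may use a different tool), wired into Scrapy's closed hook so it runs once the crawl ends:

    def closed(self, reason):
        # Render the summary DataFrame as an image, then push it to Telegram.
        # matplotlib is an assumption; any DataFrame-to-image route works.
        import matplotlib
        matplotlib.use("Agg")  # no display needed when run from the scheduler
        import matplotlib.pyplot as plt

        fig, ax = plt.subplots(figsize=(6, 2))
        ax.axis("off")
        ax.table(cellText=self.collect.astype(str).values,
                 colLabels=self.collect.columns,
                 rowLabels=self.collect.index,
                 loc="center")
        fig.savefig("table.png", bbox_inches="tight")
        plt.close(fig)
        self.sendbot("Daily index update")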

Updated: