Python crawlers: commonly used translation sites

I wrote crawlers for several translation sites using Python's requests library and am sharing the source code here; if you need it, feel free to use it directly. The code is open-sourced on GitHub: github.com/hy-struggle… If you are interested, let's exchange ideas and improve together.

The results show

powerword

Kingsoft PowerWord is very easy to crawl: you only need to POST a few parameters.

# post request
import json

import requests


class King:
    """Translate text through the Kingsoft PowerWord (iciba) web endpoint.

    The text is sent as a form-encoded POST request and the translation
    found in the JSON reply is printed to stdout.
    """

    def __init__(self, word):
        """Store the text to translate and prepare the request settings.

        :param word: the text to translate.
        """
        self.word = word
        self.url = 'http://fy.iciba.com/ajax.php?a=fy'
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) '
                          'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.132 Safari/537.36'
        }
        # Construct the parameters of the POST request:
        # 'f' / 't' are sent as 'auto' and 'w' carries the text itself.
        self.post_data = {
            'f': 'auto',
            't': 'auto',
            'w': self.word,
        }

    # send request
    def request_post(self):
        """POST the form data and return the decoded response body as a string."""
        res = requests.post(url=self.url, headers=self.headers, data=self.post_data)
        return res.content.decode()

    # parse data
    @staticmethod
    def parse_data(data):
        """Print the translation contained in a JSON response string.

        :param data: JSON text whose top-level ``'content'`` object holds the
            result under ``'out'`` or, failing that, under ``'word_mean'``.
            Nothing is printed when neither key is present.
        """
        content = json.loads(data)['content']
        if 'out' in content:
            print(content['out'])
        elif 'word_mean' in content:
            print(content['word_mean'])

    def run(self):
        """Send the request and print the parsed translation."""
        data = self.request_post()
        self.parse_data(data)


if __name__ == '__main__':
    # Script entry point: prompt for the text, then translate and print it.
    king = King(input("Translation:"))
    king.run()
Copy the code

The bing translation

Bing is also very simple, but it needs one extra function that checks whether the input is Chinese or English and adjusts the POST parameters accordingly.

# post request
import json

import requests


class Biying:
    """Translate text through the Bing translator web endpoint.

    The target language defaults to Simplified Chinese and is switched to
    English when the input contains Chinese characters.
    """

    def __init__(self, word):
        """Store the text to translate and prepare the request settings.

        :param word: the text to translate.
        """
        self.word = word
        self.url = 'https://cn.bing.com/ttranslatev3?'
        # Alternative URL carrying extra query parameters:
        # self.url = 'https://cn.bing.com/ttranslatev3?isVertical=1&&IG=E3F2E74779804936A4B134F621FE89FB&IID=translator.5028.12'
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) '
                          'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.132 Safari/537.36'
        }
        # Construct the parameters of the POST request
        self.post_data = {
            'fromLang': 'auto-detect',
            'to': 'zh-Hans',
            'text': self.word,
        }

    # check the post parameter
    def judge_post(self):
        """Switch the target language to English when the input is Chinese."""
        if self.is_chinese(self.word):
            self.post_data['to'] = 'en'

    # Check whether it is a Chinese character
    @staticmethod
    def is_chinese(uchar):
        """Return True if any character of ``uchar`` lies in U+4E00..U+9FA5.

        Each character is tested individually, so multi-character input is
        classified by its content rather than by a lexicographic comparison
        of the whole string. Single-character input behaves as before.
        """
        return any('\u4e00' <= ch <= '\u9fa5' for ch in uchar)

    # send request
    def request_post(self):
        """POST the form data and return the decoded response body as a string."""
        res = requests.post(url=self.url, headers=self.headers, data=self.post_data)
        return res.content.decode()

    # parse data
    @staticmethod
    def parse_data(data):
        """Print the first translation found in a JSON response string.

        :param data: JSON text; the value printed is
            ``[0]['translations'][0]['text']`` of the decoded document.
        """
        dict_data = json.loads(data)
        print(dict_data[0]['translations'][0]['text'])

    def run(self):
        """Adjust the target language, send the request and print the result."""
        self.judge_post()
        data = self.request_post()
        self.parse_data(data)


if __name__ == '__main__':
    # Script entry point: prompt for the text, then translate and print it.
    by = Biying(input("Translation:"))
    by.run()
Copy the code

Baidu translation

Baidu is relatively difficult: the request needs a cookie header plus the POST parameters simple_means_flag, sign and token. The cookie and the token correspond to each other, while the sign has to be computed by running the web page's JavaScript through the execjs library.

# post request
import json
import execjs
import requests

"""1. The cookie and the token parameter correspond to each other. 2. Generating the sign parameter requires calling the baidu.js program."""


class Baidu:
    """Translate text through the Baidu Fanyi ``v2transapi`` endpoint.

    The request carries a ``sign`` value computed by running the function
    ``e`` from a local ``baidu.js`` file through execjs, together with a
    hard-coded cookie and token.
    """

    def __init__(self, word):
        """Store the text, compute its sign and prepare the request settings.

        :param word: the text to translate.

        Reads ``baidu.js`` from the current working directory (via
        :meth:`get_sign`), so constructing an instance fails if that file is
        missing.
        """
        self.word = word
        self.sign = self.get_sign()
        self.url = 'https://fanyi.baidu.com/v2transapi'
        # NOTE(review): the cookie below is a hard-coded browser session
        # (it includes a BDUSS value) and is paired with the hard-coded token
        # in post_data; replace both with values from your own session and
        # avoid committing real credentials. The cookie text is reproduced
        # unchanged; a few stray spaces in it (e.g. "__guid = ", "117. 32")
        # look like copy/translation artifacts -- verify against a real cookie.
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) '
                          'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.90 Safari/537.36',
            'cookie': 'BIDUPSID=EF5D2DCB95CD02713C504B965E680572; PSTM=1508391259; '
                      'BAIDUID=FE94A1C6870007735C0EA30CA092352A:FG=1; '
                      'BDUSS=HhpVTc3VjZrQ2ppRX5RcVFoQW9-WExTQ29zYWR-'
                      'TUluOUQxRGVaWHZrWGlOWmRkRVFBQUFBJCQAAAAAAAAAAAEAAAAUxiG2ZnJlZc31vNG'
                      '~pQAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA'
                      'OKob13iqG9dW; locale=zh; __guid = 37525047.783289347368707300.1568961749022.282; '
                      'REALTIME_TRANS_SWITCH=1; FANYI_WORD_SWITCH=1; HISTORY_SWITCH=1; SOUND_SPD_SWITCH=1; '
                      'SOUND_PREFER_SWITCH=1; to_lang_often=%5B%7B%22value%22%3A%22zh%22%2C%22text%22%3A%22%u'
                      '4E2D%u6587%22%7D%2C%7B%22value%22%3A%22en%22%2C%22text%22%3A%22%u82F1%u8BED%22%7D%5D; '
                      ' from_lang_often=%5B%7B%22value%22%3A%22en%22%2C%22text%22%3A%22%u82F1%u8BED%22%7D%2C'
                      '%7B%22value%22%3A%22zh%22%2C%22text%22%3A%22%u4E2D%u6587%22%7D%5D; yjs_js_security_pass'
                      'port=67080cbdf7d8d4ad0eb8f1513b5feb52c128c29b_1569324592_js; monitor_count=3; Hm_lvt_'
                      '64ecd82404c51e03dc91cb9e8c025574=1568961749,1569324577,1569324592,1569324674; Hm_lpvt_'
                      '64ecd82404c51e03dc91cb9e8c025574=1569324674; __yjsv5_shitong = 1.0 _7_9055159b9a5e975fcd2c2 '
                      'c48931b3bc7b406_300_1569324677995_117. 32.216.70 _70981334'
        }

        # Construct the parameters of the POST request
        self.post_data = {
            'from': 'en',
            'to': 'zh',
            'query': self.word,
            'simple_means_flag': '3',
            'sign': self.sign,
            'token': '8d588b57816e1213f2bcfaf52bddbbe2'
        }

    # get sign
    def get_sign(self):
        """Return the sign for ``self.word`` by calling ``e`` in ``baidu.js``."""
        query = self.word  # the content to translate
        with open('baidu.js', 'r', encoding='utf-8') as f:
            ctx = execjs.compile(f.read())
        return ctx.call('e', query)

    # send request
    def request_post(self):
        """POST the form data and return the response body decoded from JSON."""
        res = requests.post(url=self.url, headers=self.headers, data=self.post_data)
        return json.loads(res.content.decode())

    # check the post parameter
    def judge_post(self):
        """Swap the language pair to zh -> en when the input is Chinese."""
        if self.is_chinese(self.word):
            self.post_data['from'] = 'zh'
            self.post_data['to'] = 'en'

    # Check whether it is a Chinese character
    @staticmethod
    def is_chinese(uchar):
        """Return True if any character of ``uchar`` lies in U+4E00..U+9FA5.

        Each character is tested individually, so multi-character input is
        classified by its content rather than by a lexicographic comparison
        of the whole string. Single-character input behaves as before.
        """
        return any('\u4e00' <= ch <= '\u9fa5' for ch in uchar)

    # parse data
    @staticmethod
    def parse_data(data):
        """Print the first translated segment of an already-decoded response.

        :param data: decoded JSON object; the value printed is
            ``data['trans_result']['data'][0]['dst']``.
        """
        print(data['trans_result']['data'][0]['dst'])

    def run(self):
        """Adjust the language pair, send the request and print the result."""
        self.judge_post()
        json_data = self.request_post()
        self.parse_data(json_data)


if __name__ == '__main__':
    # Script entry point: prompt for the text, then translate and print it.
    baidu = Baidu(input("Translation:"))
    baidu.run()
Copy the code

Google translation

Google Translate differs from Baidu Translate in that the crawler uses a GET request. A token (tk) must be added to the URL, and, as with the Baidu crawler, the token is computed by running JavaScript.

# get request
import json
import execjs
import requests


class Google:
    """Translate text through the Google Translate ``translate_a/single`` endpoint.

    The request is a GET whose URL carries a ``tk`` token computed by running
    the function ``TL`` from a local ``google.js`` file through execjs.
    """

    # URL template; the placeholders are filled by _build_url().
    _URL_TEMPLATE = ("http://translate.google.cn/translate_a/single?client=t"
                     "&sl=%s&tl=%s&hl=zh-CN&dt=at&dt=bd&dt=ex&dt=ld&dt=md&dt=qca"
                     "&dt=rw&dt=rm&dt=ss&dt=t&ie=UTF-8&oe=UTF-8&clearbtn=1&otf=1&pc=1"
                     "&srcrom=0&ssel=0&tsel=0&kc=2&tk=%s&q=%s")

    def __init__(self, word):
        """Store the text, compute its token and prepare the request settings.

        :param word: the text to translate.

        Reads ``google.js`` from the current working directory (via
        :meth:`get_tk`), so constructing an instance fails if that file is
        missing.
        """
        self.word = word
        self.tk = self.get_tk()
        self.sl = 'en'
        self.tl = 'zh-CN'
        self.url = self._build_url()
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) '
                          'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.132 Safari/537.36'
        }

    def _build_url(self):
        """Return the request URL for the current sl / tl / tk / word."""
        from urllib.parse import quote

        # Percent-encode the query text so characters such as '&', '#' or
        # spaces cannot corrupt the query string.
        return self._URL_TEMPLATE % (self.sl, self.tl, self.tk, quote(self.word, safe=''))

    # Check whether it is a Chinese character
    @staticmethod
    def is_chinese(uchar):
        """Return True if any character of ``uchar`` lies in U+4E00..U+9FA5.

        Each character is tested individually, so multi-character input is
        classified by its content rather than by a lexicographic comparison
        of the whole string. Single-character input behaves as before.
        """
        return any('\u4e00' <= ch <= '\u9fa5' for ch in uchar)

    # determine the URL parameter
    def judge_url(self):
        """Switch to zh-CN -> en and rebuild the URL when the input is Chinese."""
        if self.is_chinese(self.word):
            self.sl = 'zh-CN'
            self.tl = 'en'
            self.url = self._build_url()

    # Call google.js to get tk
    def get_tk(self):
        """Return the tk token for ``self.word`` by calling ``TL`` in ``google.js``."""
        query = self.word
        with open('google.js', 'r', encoding='utf-8') as f:
            ctx = execjs.compile(f.read())
        return ctx.call('TL', query)

    # send request
    def request_get(self):
        """GET the prepared URL and return the response body decoded from JSON."""
        res = requests.get(url=self.url, headers=self.headers)
        return json.loads(res.content.decode())

    # parse data
    @staticmethod
    def parse_data(data):
        """Print ``data[0][0][0]``, the first translated segment of the response."""
        print(data[0][0][0])

    def run(self):
        """Adjust the language pair, send the request and print the result."""
        self.judge_url()
        json_data = self.request_get()
        self.parse_data(json_data)


if __name__ == '__main__':
    # Script entry point: prompt for the text, then translate and print it.
    google = Google(input("Translation:"))
    google.run()
Copy the code

Python crawlers: commonly used translation sites

The results show

powerword

The bing translation

Baidu translation

Google translation

Related Posts

How to use caching correctly in distributed systems? Don’t introduce a time bomb into your project!

Interesting story master class and object, ask about the university of shoe factory!

Basic Usage of mongodb