用python爬Google翻译

"借用Google翻译接口用用"

Posted by Weiwq on June 16, 2019

“爬Google翻译还是需要点技巧的“

引言

在做全球应用中,正确处理每个国家的翻译是必不可少的,也是最棘手的问题,尤其是那些文字方向是右到左的。为了保证翻译是正确的,这里需要借用Google翻译的接口,为此,特意写了一个python脚本来对接google翻译 ^_^。

1、GoogleTranslate代码(入口代码)

注意:如果返回code不为200,请更新cookies

#!/usr/bin/python
# -*- encoding:utf-8 -*-
import requests
from urllib.parse import quote

import TkParams
import json

cookies = [
    '_ga=GA1.3.1058806942.1531796090; NID=182=k5XIOE2zGfcgE2KEP4iTQWpzGcWXQpEHZhBh_3BF9lRwlnxyn24W2jnTaXfadXinVn6ZVa4Mkpk8HZS02sF7adR-6XI60kfQMEut5c9VQgZxDfgJnatiVzhS7qrHyZ4zP3bamIWHZ16BxtOPfiLeAsgxbUu9g_0XzqSAqgQp9GI; _gid=GA1.3.1828501068.1558528281; 1P_JAR=2019-5-23-0; _gat=1',
    '_ga=GA1.3.1058806942.1531796090; _gid=GA1.3.366383484.1556091451; NID=182=k5XIOE2zGfcgE2KEP4iTQWpzGcWXQpEHZhBh_3BF9lRwlnxyn24W2jnTaXfadXinVn6ZVa4Mkpk8HZS02sF7adR-6XI60kfQMEut5c9VQgZxDfgJnatiVzhS7qrHyZ4zP3bamIWHZ16BxtOPfiLeAsgxbUu9g_0XzqSAqgQp9GI; 1P_JAR=2019-4-25-0; _gat=1'
]


class constant:
    cookie_index = 0


"""
get translate url 
"""


def _get_translate_url(from_language, to_language, translate_text):
    TkParams.refresh_tkk()
    tk = TkParams.acquire(translate_text)
    key = quote(translate_text)
    sl = from_language
    tl = to_language
    url = 'https://translate.google.cn/translate_a/single?client=webapp&sl=%s&tl=%s&hl=zh-CN&dt=at&dt=bd&dt=ex&dt=ld&dt=md&dt=qca&dt=rw&dt=rm&dt=ss&dt=t&otf=1&pc=1&ssel=0&tsel=0&kc=3&tk=%s&q=%s' % (
        sl, tl, tk, key)
    return url


def _get(url):
    cookie_len = len(cookies)
    constant.cookie_index += 1
    constant.cookie_index %= cookie_len
    headers = {'content-type': 'application/json; charset=UTF-8',
               'accept-language': 'zh-CN,zh;q=0.9',
               "content-disposition": "attachment; filename=\"f.txt\"",
               'Accept-Encoding': '',
               "cookie": cookies[constant.cookie_index],
               'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36',
               "accept": "text/html",
               "x-client-data": "CJe2yQEIo7bJAQjEtskBCKmdygEIqKPKAQixp8oBCOKoygEI8anKAQivrMoBGIKYygE=",
               "accept-Charset": "UTF-8"}
    try:
        res = requests.get(url, verify=True, headers=headers, timeout=3)
    except:
        return '', 404
    return res.text.encode('UTF-8').decode('UTF-8'), res.status_code


"""
获取结果列表的第一个翻译
"""


def _getTranslateFirstWord(translateJson):
    return translateJson[0][0][0].lower()


"""
翻译接口,提供外部使用
参数说明:

#from_language: 单词的语言代码,如:'auto'(自动检测语言)
#to_language: 目标语言代码,如:'en' (英语)
#word: 需要翻译的文字 
"""

def translate(from_language, to_language, word):
    url = _get_translate_url(from_language, to_language, word)
    translateRes, code = _get(url)
    if code is not 200:
        return ''
    jsonArray = json.loads(translateRes.lower())
    return _getTranslateFirstWord(jsonArray)


if __name__ == '__main__':
    print(translate('auto', 'en', '你好'))

2、翻译鉴权

由于Google翻译的接口是收费的,会对翻译内容做一个加密鉴权的限制,可以通过一下方法得到翻译词条的加密字符串,TkParams类的实现如下:

#!/usr/bin/python
# -*- encoding:utf-8 -*-
import math
import re
import requests


class constant:
    TKK = '432247.1200919766'  # this value maybe change by google ,then we need update


def rshift(val, n):
    """python port for '>>>'(right shift with padding)
    """
    return (val % 0x100000000) >> n


def _xr(a, b):
    size_b = len(b)
    c = 0
    while c < size_b - 2:
        d = b[c + 2]
        d = ord(d[0]) - 87 if 'a' <= d else int(d)
        d = rshift(a, d) if '+' == b[c + 1] else a << d
        a = a + d & 4294967295 if '+' == b[c] else a ^ d

        c += 3
    return a


def acquire(text):
    a = []
    # Convert text to ints
    for i in text:
        val = ord(i)
        if val < 0x10000:
            a += [val]
        else:
            # Python doesn't natively use Unicode surrogates, so account for those
            a += [
                math.floor((val - 0x10000) / 0x400 + 0xD800),
                math.floor((val - 0x10000) % 0x400 + 0xDC00)
            ]

    b = constant.TKK
    d = b.split('.')
    b = int(d[0]) if len(d) > 1 else 0

    # assume e means char code array
    e = []
    g = 0
    size = len(text)
    while g < size:
        l = a[g]
        # just append if l is less than 128(ascii: DEL)
        if l < 128:
            e.append(l)
        # append calculated value if l is less than 2048
        else:
            if l < 2048:
                e.append(l >> 6 | 192)
            else:
                # append calculated value if l matches special condition
                if (l & 64512) == 55296 and g + 1 < size and \
                        a[g + 1] & 64512 == 56320:
                    g += 1
                    l = 65536 + ((l & 1023) << 10) + (a[g] & 1023)  # This bracket is important
                    e.append(l >> 18 | 240)
                    e.append(l >> 12 & 63 | 128)
                else:
                    e.append(l >> 12 | 224)
                e.append(l >> 6 & 63 | 128)
            e.append(l & 63 | 128)
        g += 1
    a = b
    for i, value in enumerate(e):
        a += value
        a = _xr(a, '+-a^+6')
    a = _xr(a, '+-3^+b+-f')
    a ^= int(d[1]) if len(d) > 1 else 0
    if a < 0:  # pragma: nocover
        a = (a & 2147483647) + 2147483648
    a %= 1000000  # int(1E6)

    return '{}.{}'.format(a, a ^ b)


def _get(url):
    headers = {'content-type': 'application/json; charset=UTF-8',
               'accept-language': 'zh-CN,zh;q=0.9',
               "content-disposition": "attachment; filename=\"f.txt\"",
               'Accept-Encoding': '',
               "cookie": "_ga=GA1.3.1058806942.1531796090; _gid=GA1.3.743573119.1555898451; 1P_JAR=2019-4-22-2; NID=181=B4bCvuoYOtOPbrQB1626zFADiTQkwCg7F8AYSi1heEAi08NXZGTrYLqDyqjvmX3O_wzaVDq2SBk9gIQE2aKFiGOQqCW6PEcsHyH9gvEqnXa81gq0fhjsq_5UNXY2JWTdRdQB_Da9sAHLG-S5vcDx0SxMvx9qcX--hJmWcYzVLeU",
               'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36',
               "accept": "text/html",
               "accept-Charset": "UTF-8"}
    try:
        res = requests.get(url, verify=True, headers=headers, timeout=3)
    except:
        return '', 404
    return res.text.encode('UTF-8').decode('UTF-8'), res.status_code


# refresh tkk key
def refresh_tkk():
    text, code = _get("https://translate.google.cn/")
    if code != 200:
        return False
    text = re.findall(r'tkk(.+?),', text)
    constant.TKK = re.sub(':|\'', '', text[0])
    return True

3、语言代码对照表

为了方便使用,这里整理了多国语言对应的缩写,应是比较全了, LanguageMapCode类如下

languageMapCode = {
    '检测语言': 'auto',
    '阿尔巴尼亚语': 'sq',
    '阿拉伯语': 'ar',
    '阿姆哈拉语': 'am',
    '阿塞拜疆语': 'az',
    '爱尔兰语': 'ga',
    '爱沙尼亚语': 'et',
    '巴斯克语': 'eu',
    '白俄罗斯语': 'be',
    '保加利亚语': 'bg',
    '冰岛语': 'is',
    '波兰语': 'pl',
    '波斯尼亚语': 'bs',
    '波斯语': 'fa',
    '布尔语(南非荷兰语)': 'af',
    '丹麦语': 'da',
    '德语': 'de',
    '俄语': 'ru',
    '法语': 'fr',
    '菲律宾语': 'tl',
    '芬兰语': 'fi',
    '弗里西语': 'fy',
    '高棉语': 'km',
    '格鲁吉亚语': 'ka',
    '古吉拉特语': 'gu',
    '哈萨克语': 'kk',
    '海地克里奥尔语': 'ht',
    '韩语': 'ko',
    '豪萨语': 'ha',
    '荷兰语': 'nl',
    '吉尔吉斯语': 'ky',
    '加利西亚语': 'gl',
    '加泰罗尼亚语': 'ca',
    '捷克语': 'cs',
    '卡纳达语': 'kn',
    '科西嘉语': 'co',
    '克罗地亚语': 'hr',
    '库尔德语': 'ku',
    '拉丁语': 'la',
    '拉脱维亚语': 'lv',
    '老挝语': 'lo',
    '立陶宛语': 'lt',
    '卢森堡语': 'lb',
    '罗马尼亚语': 'ro',
    '马尔加什语': 'mg',
    '马耳他语': 'mt',
    '马拉地语': 'mr',
    '马拉雅拉姆语': 'ml',
    '马来语': 'ms',
    '马其顿语': 'mk',
    '毛利语': 'mi',
    '蒙古语': 'mn',
    '孟加拉语': 'bn',
    '缅甸语': 'my',
    '苗语': 'hmn',
    '南非科萨语': 'xh',
    '南非祖鲁语': 'zu',
    '尼泊尔语': 'ne',
    '挪威语': 'no',
    '旁遮普语': 'pa',
    '葡萄牙语': 'pt',
    '普什图语': 'ps',
    '齐切瓦语': 'ny',
    '日语': 'ja',
    '瑞典语': 'sv',
    '萨摩亚语': 'sm',
    '塞尔维亚语': 'sr',
    '塞索托语': 'st',
    '僧伽罗语': 'si',
    '世界语': 'eo',
    '斯洛伐克语': 'sk',
    '斯洛文尼亚语': 'sl',
    '斯瓦希里语': 'sw',
    '苏格兰盖尔语': 'gd',
    '宿务语': 'ceb',
    '索马里语': 'so',
    '塔吉克语': 'tg',
    '泰卢固语': 'te',
    '泰米尔语': 'ta',
    '泰语': 'th',
    '土耳其语': 'tr',
    '威尔士语': 'cy',
    '乌尔都语': 'ur',
    '乌克兰语': 'uk',
    '乌兹别克语': 'uz',
    '西班牙语': 'es',
    '希伯来语': 'iw',
    '希腊语': 'el',
    '夏威夷语': 'haw',
    '信德语': 'sd',
    '匈牙利语': 'hu',
    '修纳语': 'sn',
    '亚美尼亚语': 'hy',
    '伊博语': 'ig',
    '意大利语': 'it',
    '意第绪语': 'yi',
    '印地语': 'hi',
    '印尼巽他语': 'su',
    '印尼语': 'id',
    '印尼爪哇语': 'jw',
    '英语': 'en',
    '约鲁巴语': 'yo',
    '越南语': 'vi',
    '中文': 'zh-CN',
    '中文(繁体)': 'zh-TW'
}


"""
获取语言代码
"""

def get_language_code(language):
    if language in languageMapCode:
        return languageMapCode[language]
    return ''

用法如下:

if __name__ == '__main__':
	to_language = LanguageMapCode.get_language_code('英语')
    print(GoogleTranslate.translate('auto', to_language, '你好'))

后记

处理多国翻译的工作,最好还是交给专业的翻译公司,当然,也可以参考Google翻译,毕竟Google的产品还是比较靠谱的~~

—— Weiwq 后记于 2019.06 广州