# Use Python to get URLs from a Baidu search

# coding=utf-8

#code by xi4okv QQ:48011203 site:xiaokui.cc

import urllib2 as url

import urllib2

import string

import urllib

import re

import sys


def help():
    """Print the command-line usage string for this script."""
    usage = "python baidu.py keyword page"
    print(usage)

    


def baidu_search(keyword, pn):
    """Fetch one Baidu search result page and return its body.

    keyword -- the search term. It is percent-encoded here, so callers
               pass the raw text (spaces, '&', non-ASCII bytes are fine).
    pn      -- value sent as the ``pn`` query parameter.

    Returns whatever ``read()`` yields for the response.
    Errors raised by ``urlopen`` are not caught and propagate to the caller.
    """
    # urlencode() calls str() on each value, which fails for non-ASCII
    # unicode objects under Python 2, so encode those to UTF-8 bytes first.
    if isinstance(keyword, unicode):
        keyword = keyword.encode('utf-8')

    # A list of pairs (not a dict) keeps the parameter order stable.
    query = urllib.urlencode([('wd', keyword), ('pn', pn)])
    res = url.urlopen("https://www.baidu.com/s?" + query)
    try:
        return res.read()
    finally:
        # Release the connection even when read() fails.
        res.close()

        

# Compiled once at import time; captures the text between `url":"` and the
# next `"}` in the page source.
_SITE_URL_PATTERN = re.compile(r'url":"(.*?)"}')


def get_url(html):
    """Extract every `url":"..."}` value from a result page.

    html -- page source as a string; may be empty or None.

    Returns a list of the captured strings, in order of appearance.
    For empty/None input, prints "ERROR!" and returns an empty list so
    callers can always iterate over the result.
    """
    if not html:
        print("ERROR!")
        return []
    return _SITE_URL_PATTERN.findall(html)


def baidu_url(xk_url):
    """Open "http:" + xk_url and return the URL of the final response.

    xk_url -- scheme-relative link (the caller passes strings that
              contain "link?url").

    Returns the response's URL as reported by ``geturl()`` (the address
    after any redirects were followed), or None when the request fails;
    in that case "ERROR!" is printed.
    """
    try:
        baidu = urllib2.urlopen("http:" + xk_url)
    except Exception:
        # Stay best-effort for request/URL failures, but unlike a bare
        # `except:` let KeyboardInterrupt and SystemExit through.
        print("ERROR!")
        return None

    try:
        return baidu.geturl()
    finally:
        baidu.close()

    

def main():
    """Search Baidu for argv[1] over argv[2] pages and save resolved links.

    Each extracted link containing "link?url" is resolved through
    baidu_url(); every resolved URL is printed and written, one per line,
    to 'result.lst' in the current directory (the file is overwritten).

    When the arguments are missing or the page count is not an integer,
    the usage string is printed and nothing else happens.
    """
    if len(sys.argv) < 3:
        help()
        return

    keyword = sys.argv[1]
    try:
        page = int(sys.argv[2])
    except ValueError:
        help()
        return

    print('search ' + keyword + ' in baidu:')

    # Open the output only after the arguments are validated, so a bad
    # invocation does not truncate an existing result file; `with`
    # guarantees the file is flushed and closed.
    with open('result.lst', 'w+') as f:
        # Send pn = 0, 10, 20, ... so the first result page is included.
        for index in range(page):
            html = baidu_search(keyword, 10 * index)
            for xk_url in get_url(html) or []:
                if "link?url" not in xk_url:
                    continue
                result = baidu_url(xk_url)
                if not result:
                    # Link could not be resolved; report and move on.
                    print("ERROR")
                    continue
                f.write(result + "\n")
                print(result)


if __name__ == '__main__':
    main()


# Comments
# Popularity ( 1 )
#
# (c) ID1536264 | Powered by LOFTER