링크 : https://code.google.com/p/pygoogle/
간단 예제
from pygoogle import pygoogle
g = pygoogle('quake 3 arena')
g.pages = 5
print '*Found %s results*'%(g.get_result_count())
g.get_urls()
노트
| method | return |
| search() | returns a dict of Title/URLs |
| get_urls() | returns list of result URLs |
| get_result_count() | returns the number of results |
| display_results() | prints results (for command line) |
소스
#!/usr/bin/python
"""
Google AJAX Search Module
http://code.google.com/apis/ajaxsearch/documentation/reference.html
Needs Python 2.6 or later
"""
# Prefer the standard-library json module and fall back to the API-compatible
# simplejson package when it is unavailable.
try:
    import json
except ImportError:
    # The fallback import needs its own try block: a second
    # `except ImportError` on the outer try could never run, and a missing
    # simplejson would escape as an unhandled traceback.
    try:
        import simplejson as json
    except ImportError as e:
        # Neither JSON implementation is installed; report why and stop.
        print(e)
        exit()
import sys
import urllib
import logging
import argparse
__author__ = "Kiran Bandla"
__version__ = "0.2"

# Base endpoint; the url-encoded query string is appended directly to it.
URL = 'http://ajax.googleapis.com/ajax/services/search/web?'

# ---------------------------------------------------------------------------
# Web Search Specific Arguments (SAFE, FILTER)
# http://code.google.com/apis/ajaxsearch/documentation/reference.html#_fonje_web
# ---------------------------------------------------------------------------

# SAFE
# Optional argument supplying the search safety level, one of:
#   * safe=active   - enables the highest level of safe search filtering
#   * safe=moderate - enables moderate safe search filtering (default)
#   * safe=off      - disables safe search filtering
SAFE_ACTIVE = "active"
SAFE_MODERATE = "moderate"
SAFE_OFF = "off"

# FILTER
# Optional argument turning the duplicate content filter on or off:
#   * filter=0 - turns off the duplicate content filter
#   * filter=1 - turns on the duplicate content filter (default)
FILTER_OFF = 0
FILTER_ON = 1

# ---------------------------------------------------------------------------
# Standard URL Arguments
# http://code.google.com/apis/ajaxsearch/documentation/reference.html#_fonje_args
# ---------------------------------------------------------------------------

# RSZ
# Optional argument supplying the number of results the application would
# like to receive.
#   * small - a small result set, 4 results (assumed when not supplied)
#   * large - a large result set, 8 results
RSZ_SMALL = "small"
RSZ_LARGE = "large"

# HL
# Optional argument supplying the host language of the application making
# the request. When absent, the system chooses a value from the
# Accept-Language http header; when that header is absent too, a value of
# en is assumed.
class pygoogle:
def __init__(self, query, pages=10, hl='en', log_level=logging.INFO):
    """
    Store the search parameters and set up the instance logger.

    query     -- the search string sent as the 'q' argument
    pages     -- number of result pages to request (default 10)
    hl        -- host language code (default 'en', English)
    log_level -- level applied to the 'pygoogle' logger
    """
    # What to search for, and how much of it.
    self.query = query
    self.pages = pages
    self.hl = hl

    # Request tuning; callers may overwrite these attributes afterwards.
    self.rsz = RSZ_LARGE        # results per page: small = 4 / large = 8
    self.filter = FILTER_ON     # duplicate content filter, on = 1
    self.safe = SAFE_OFF        # safe search level: active/moderate/off

    self.__setup_logging(level=log_level)
def __setup_logging(self, level):
    """
    Configure the shared 'pygoogle' logger and store it on self.logger.

    level -- logging level applied to the logger (e.g. logging.INFO)

    logging.getLogger returns the same logger object for a given name, so
    every instance shares it. The stdout handler is therefore attached only
    when the logger has none yet; adding one per instance would print each
    message once for every object ever created.
    """
    logger = logging.getLogger('pygoogle')
    logger.setLevel(level)
    if not logger.handlers:
        handler = logging.StreamHandler(sys.stdout)
        handler.setFormatter(logging.Formatter('%(module)s %(levelname)s %(funcName)s| %(message)s'))
        logger.addHandler(handler)
    self.logger = logger
def __search__(self, print_results=False):
    '''
    Query the search endpoint once per page and collect the raw responses.

    print_results -- when True, also print title, cleaned-up snippet and
                     URL of every result to stdout.

    Returns a list with one decoded JSON response (a dict) per page whose
    responseStatus was 200. Pages that fail are logged and skipped, so the
    list may be empty; it is never False.
    '''
    # The page size does not change between iterations, so resolve it once.
    rsz = 4 if self.rsz == RSZ_SMALL else 8
    results = []
    for page in range(0, self.pages):
        args = {'q': self.query,
                'v': '1.0',
                'start': page * rsz,
                'rsz': self.rsz,
                'safe': self.safe,
                'filter': self.filter,
                'hl': self.hl,
                }
        self.logger.debug('search: "%s" page# : %s' % (self.query, page))
        q = urllib.urlencode(args)
        search_results = urllib.urlopen(URL + q)
        try:
            data = json.loads(search_results.read())
        finally:
            # Release the connection even when the body is not valid JSON.
            search_results.close()
        if 'responseStatus' not in data:
            self.logger.error('response does not have a responseStatus key')
            continue
        if data.get('responseStatus') != 200:
            self.logger.debug('responseStatus is not 200')
            self.logger.error('responseDetails : %s' % (data.get('responseDetails', None)))
            continue
        if print_results:
            # .get() also covers a responseData that is present but None.
            response_data = data.get('responseData')
            if response_data and 'results' in response_data:
                for result in response_data['results']:
                    if not result:
                        continue
                    snippet = result['content'].strip()
                    # Remove the bold-ellipsis marker as a whole substring.
                    # str.strip("<b>...</b>") treats its argument as a set of
                    # characters and would also eat a leading or trailing
                    # 'b', '.', '/', '<' or '>' of the actual text.
                    marker = "<b>...</b>"
                    if snippet.startswith(marker):
                        snippet = snippet[len(marker):]
                    if snippet.endswith(marker):
                        snippet = snippet[:-len(marker)]
                    # Drop remaining bold tags and decode the apostrophe entity.
                    snippet = snippet.replace("<b>", '').replace("</b>", '').replace("&#39;", "'").strip()
                    # Single-argument print(...) behaves the same as the
                    # print statement on Python 2.
                    print('[%s]' % (urllib.unquote(result['titleNoFormatting'])))
                    print(snippet)
                    print(urllib.unquote(result['unescapedUrl']) + '\n')
            else:
                # no usable responseData was found in 'data'
                self.logger.error('no responseData key found in response. very unusal')
        results.append(data)
    return results
def search(self):
    """Returns a dict of Title/URLs

    Keys are the unquoted 'titleNoFormatting' values and values the
    unquoted 'unescapedUrl' values of every result on every page returned
    by __search__. Results sharing a title overwrite each other, so only
    the last URL for a title is kept. Returns an empty dict when there are
    no pages.
    """
    results = {}
    search_results = self.__search__()
    if not search_results:
        self.logger.info('No results returned')
        return results
    for data in search_results:
        # .get() guards against responseData being absent as well as None.
        response_data = data.get('responseData')
        if not response_data or 'results' not in response_data:
            self.logger.error('no responseData key found in response')
            self.logger.error(data)
            continue
        for result in response_data['results']:
            # Skip empty entries and entries lacking either field we need,
            # rather than failing the whole search on one malformed result.
            if not result:
                continue
            if 'titleNoFormatting' not in result or 'unescapedUrl' not in result:
                continue
            title = urllib.unquote(result['titleNoFormatting'])
            results[title] = urllib.unquote(result['unescapedUrl'])
    return results
def search_page_wise(self):
"""Returns a dict of page-wise urls"""
results = {}
for page in range(0,self.pages):
args = {'q' : self.query,
'v' : '1.0',
'start' : page,
'rsz': RSZ_LARGE,
'safe' : SAFE_OFF,
'filter' : FILTER_ON,
}
q = urllib.urlencode(args)
search_results = urllib.urlopen(URL+q)
data = json.loads(search_results.read())
urls = []