#!/usr/bin/env python """ Python wrapper for Google web APIs This module allows you to access Google's web APIs through SOAP, to do things like search Google and get the results programmatically. Described U{here } You need a Google-provided license key to use these services. Follow the link above to get one. These functions will look in several places (in this order) for the license key: - the "license_key" argument of each function - the module-level LICENSE_KEY variable (call setLicense once to set it) - an environment variable called GOOGLE_LICENSE_KEY - a file called ".googlekey" in the current directory - a file called "googlekey.txt" in the current directory - a file called ".googlekey" in your home directory - a file called "googlekey.txt" in your home directory - a file called ".googlekey" in the same directory as google.py - a file called "googlekey.txt" in the same directory as google.py Sample usage:: >>> import google >>> google.setLicense('...') # must get your own key! >>> data = google.doGoogleSearch('python') >>> data.meta.searchTime 0.043221000000000002 >>> data.results[0].URL 'http://www.python.org/' >>> data.results[0].title 'Python Language Website' @newfield contrib: Contributors @author: Mark Pilgrim @author: Brian Landers @license: Python @version: 0.6 @contrib: David Ascher, for the install script @contrib: Erik Max Francis, for the command line interface @contrib: Michael Twomey, for HTTP proxy support @contrib: Mark Recht, for patches to support SOAPpy """ __author__ = "Mark Pilgrim (f8dy@diveintomark.org)" __version__ = "0.6" __cvsversion__ = "$Revision: 1.5 $"[11:-2] __date__ = "$Date: 2004/02/25 23:46:07 $"[7:-2] __copyright__ = "Copyright (c) 2002 Mark Pilgrim" __license__ = "Python" __credits__ = """David Ascher, for the install script Erik Max Francis, for the command line interface Michael Twomey, for HTTP proxy support""" import os, sys, getopt import GoogleSOAPFacade LICENSE_KEY = None HTTP_PROXY = None # # Constants # _url = 'http://api.google.com/search/beta2' _namespace = 'urn:GoogleSearch' _googlefile1 = ".googlekey" _googlefile2 = "googlekey.txt" _false = GoogleSOAPFacade.false _true = GoogleSOAPFacade.true _licenseLocations = ( ( lambda key: key, 'passed to the function in license_key variable' ), ( lambda key: LICENSE_KEY, 'module-level LICENSE_KEY variable (call setLicense to set it)' ), ( lambda key: os.environ.get( 'GOOGLE_LICENSE_KEY', None ), 'an environment variable called GOOGLE_LICENSE_KEY' ), ( lambda key: _contentsOf( os.getcwd(), _googlefile1 ), '%s in the current directory' % _googlefile1), ( lambda key: _contentsOf( os.getcwd(), _googlefile2 ), '%s in the current directory' % _googlefile2), ( lambda key: _contentsOf( os.environ.get( 'HOME', '' ), _googlefile1 ), '%s in your home directory' % _googlefile1), ( lambda key: _contentsOf( os.environ.get( 'HOME', '' ), _googlefile2 ), '%s in your home directory' % _googlefile2 ), ( lambda key: _contentsOf( _getScriptDir(), _googlefile1 ), '%s in the google.py directory' % _googlefile1 ), ( lambda key: _contentsOf( _getScriptDir(), _googlefile2 ), '%s in the google.py directory' % _googlefile2 ) ) ## ---------------------------------------------------------------------- ## Exceptions ## ---------------------------------------------------------------------- class NoLicenseKey(Exception): """ Thrown when the API is unable to find a valid license key. """ pass ## ---------------------------------------------------------------------- ## administrative functions (non-API) ## ---------------------------------------------------------------------- def _version(): """ Display a formatted version string for the module """ print """PyGoogle %(__version__)s %(__copyright__)s released %(__date__)s Thanks to: %(__credits__)s""" % globals() def _usage(): """ Display usage information for the command-line interface """ program = os.path.basename(sys.argv[0]) print """Usage: %(program)s [options] [querytype] query options: -k, --key= Google license key (see important note below) -1, -l, --lucky show only first hit -m, --meta show meta information -r, --reverse show results in reverse order -x, --proxy= use HTTP proxy -h, --help print this help -v, --version print version and copyright information -t, --test run test queries querytype: -s, --search= search (default) -c, --cache= retrieve cached page -p, --spelling= check spelling IMPORTANT NOTE: all Google functions require a valid license key; visit http://www.google.com/apis/ to get one. %(program)s will look in these places (in order) and use the first license key it finds: * the key specified on the command line""" % vars() for get, location in _licenseLocations[2:]: print " *", location ## ---------------------------------------------------------------------- ## utility functions (API) ## ---------------------------------------------------------------------- def setLicense(license_key): """ Set the U{Google APIs } license key @param license_key: The new key to use @type license_key: String @todo: validate the key? """ global LICENSE_KEY LICENSE_KEY = license_key def getLicense(license_key = None): """ Get the U{Google APIs } license key The key can be read from any number of locations. See the module-leve documentation for the search order. @return: the license key @rtype: String @raise NoLicenseKey: if no valid key could be found """ for get, location in _licenseLocations: rc = get(license_key) if rc: return rc _usage() raise NoLicenseKey, 'get a license key at http://www.google.com/apis/' def setProxy(http_proxy): """ Set the HTTP proxy to be used when accessing Google @param http_proxy: the proxy to use @type http_proxy: String @todo: validiate the input? """ global HTTP_PROXY HTTP_PROXY = http_proxy def getProxy(http_proxy = None): """ Get the HTTP proxy we use for accessing Google @return: the proxy @rtype: String """ return http_proxy or HTTP_PROXY def _contentsOf(dirname, filename): filename = os.path.join(dirname, filename) if not os.path.exists(filename): return None fsock = open(filename) contents = fsock.read() fsock.close() return contents def _getScriptDir(): if __name__ == '__main__': return os.path.abspath(os.path.dirname(sys.argv[0])) else: return os.path.abspath(os.path.dirname(sys.modules[__name__].__file__)) def _marshalBoolean(value): if value: return _true else: return _false def _getRemoteServer( http_proxy ): return GoogleSOAPFacade.getProxy( _url, _namespace, http_proxy ) ## ---------------------------------------------------------------------- ## search results classes ## ---------------------------------------------------------------------- class _SearchBase: def __init__(self, params): for k, v in params.items(): if isinstance(v, GoogleSOAPFacade.structType): v = GoogleSOAPFacade.toDict( v ) try: if isinstance(v[0], GoogleSOAPFacade.structType): v = [ SOAPProxy.toDict( node ) for node in v ] except: pass self.__dict__[str(k)] = v ## ---------------------------------------------------------------------- class SearchResultsMetaData(_SearchBase): """ Container class for metadata about a given search query's results. @ivar documentFiltering: is duplicate page filtering active? @ivar searchComments: human-readable informational message example:: "'the' is a very common word and was not included in your search" @ivar estimatedTotalResultsCount: estimated total number of results for this query. @ivar estimateIsExact: is estimatedTotalResultsCount an exact value? @ivar searchQuery: search string that initiated this search @ivar startIndex: index of the first result returned (zero-based) @ivar endIndex: index of the last result returned (zero-based) @ivar searchTips: human-readable informational message on how to better use Google. @ivar directoryCategories: list of categories for the search results This field is a list of dictionaries, like so:: { 'fullViewableName': 'the Open Directory category', 'specialEncoding': 'encoding scheme of this directory category' } @ivar searchTime: total search time, in seconds """ pass ## ---------------------------------------------------------------------- class SearchResult(_SearchBase): """ Encapsulates the results from a search. @ivar URL: URL @ivar title: title (HTML) @ivar snippet: snippet showing query context (HTML @ivar cachedSize: size of cached version of this result, (KB) @ivar relatedInformationPresent: is the "related:" keyword supported? Flag indicates that the "related:" keyword is supported for this URL @ivar hostName: used when filtering occurs When filtering occurs, a maximum of two results from any given host is returned. When this occurs, the second resultElement that comes from that host contains the host name in this parameter. @ivar directoryCategory: Open Directory category information This field is a dictionary with the following values:: { 'fullViewableName': 'the Open Directory category', 'specialEncoding' : 'encoding scheme of this directory category' } @ivar directoryTitle: Open Directory title of this result (or blank) @ivar summary: Open Directory summary for this result (or blank) """ pass ## ---------------------------------------------------------------------- class SearchReturnValue: """ complete search results for a single query @ivar meta: L{SearchResultsMetaData} instance for this query @ivar results: list of L{SearchResult} objects for this query """ def __init__( self, metadata, results ): self.meta = metadata self.results = results ## ---------------------------------------------------------------------- ## main functions ## ---------------------------------------------------------------------- def doGoogleSearch( q, start = 0, maxResults = 10, filter = 1, restrict='', safeSearch = 0, language = '', inputencoding = '', outputencoding = '',\ license_key = None, http_proxy = None ): """ Search Google using the SOAP API and return the results. You need a license key to call this function; see the U{Google APIs } site to get one. Then you can either pass it to this function every time, or set it globally; see the L{google} module-level docs for details. See U{http://www.google.com/help/features.html} for examples of advanced features. Anything that works at the Google web site will work as a query string in this method. You can use the C{start} and C{maxResults} parameters to page through multiple pages of results. Note that 'maxResults' is currently limited by Google to 10. See the API reference for more advanced examples and a full list of country codes and topics for use in the C{restrict} parameter, along with legal values for the C{language}, C{inputencoding}, and C{outputencoding} parameters. You can download the API documentation U{http://www.google.com/apis/download.html }. @param q: search string. @type q: String @param start: (optional) zero-based index of first desired result. @type start: int @param maxResults: (optional) maximum number of results to return. @type maxResults: int @param filter: (optional) flag to request filtering of similar results @type filter: int @param restrict: (optional) restrict results by country or topic. @type restrict: String @param safeSearch: (optional) @type safeSearch: int @param language: (optional) @type language: String @param inputencoding: (optional) @type inputencoding: String @param outputencoding: (optional) @type outputencoding: String @param license_key: (optional) the Google API license key to use @type license_key: String @param http_proxy: (optional) the HTTP proxy to use for talking to Google @type http_proxy: String @return: the search results encapsulated in an object @rtype: L{SearchReturnValue} """ license_key = getLicense( license_key ) http_proxy = getProxy( http_proxy ) remoteserver = _getRemoteServer( http_proxy ) filter = _marshalBoolean( filter ) safeSearch = _marshalBoolean( safeSearch ) data = remoteserver.doGoogleSearch( license_key, q, start, maxResults, filter, restrict, safeSearch, language, inputencoding, outputencoding ) metadata = GoogleSOAPFacade.toDict( data ) del metadata["resultElements"] metadata = SearchResultsMetaData( metadata ) results = [ SearchResult( GoogleSOAPFacade.toDict( node ) ) \ for node in data.resultElements ] return SearchReturnValue( metadata, results ) ## ---------------------------------------------------------------------- def doGetCachedPage( url, license_key = None, http_proxy = None ): """ Retrieve a page from the Google cache. You need a license key to call this function; see the U{Google APIs } site to get one. Then you can either pass it to this function every time, or set it globally; see the L{google} module-level docs for details. @param url: full URL to the page to retrieve @type url: String @param license_key: (optional) the Google API key to use @type license_key: String @param http_proxy: (optional) the HTTP proxy server to use @type http_proxy: String @return: full text of the cached page @rtype: String """ license_key = getLicense( license_key ) http_proxy = getProxy( http_proxy ) remoteserver = _getRemoteServer( http_proxy ) return remoteserver.doGetCachedPage( license_key, url ) ## ---------------------------------------------------------------------- def doSpellingSuggestion( phrase, license_key = None, http_proxy = None ): """ Get spelling suggestions from Google You need a license key to call this function; see the U{Google APIs } site to get one. Then you can either pass it to this function every time, or set it globally; see the L{google} module-level docs for details. @param phrase: word or phrase to spell-check @type phrase: String @param license_key: (optional) the Google API key to use @type license_key: String @param http_proxy: (optional) the HTTP proxy to use @type http_proxy: String @return: text of any suggested replacement, or None """ license_key = getLicense( license_key ) http_proxy = getProxy( http_proxy) remoteserver = _getRemoteServer( http_proxy ) return remoteserver.doSpellingSuggestion( license_key, phrase ) ## ---------------------------------------------------------------------- ## functional test suite (see googletest.py for unit test suite) ## ---------------------------------------------------------------------- def _test(): """ Run functional test suite. """ try: getLicense(None) except NoLicenseKey: return print "Searching for Python at google.com..." data = doGoogleSearch( "Python" ) _output( data, { "func": "doGoogleSearch"} ) print "\nSearching for 5 _French_ pages about Python, " print "encoded in ISO-8859-1..." data = doGoogleSearch( "Python", language = 'lang_fr', outputencoding = 'ISO-8859-1', maxResults = 5 ) _output( data, { "func": "doGoogleSearch" } ) phrase = "Pyhton programming languager" print "\nTesting spelling suggestions for '%s'..." % phrase data = doSpellingSuggestion( phrase ) _output( data, { "func": "doSpellingSuggestion" } ) ## ---------------------------------------------------------------------- ## Command-line interface ## ---------------------------------------------------------------------- class _OutputFormatter: def boil(self, data): if type(data) == type(u""): return data.encode("ISO-8859-1", "replace") else: return data class _TextOutputFormatter(_OutputFormatter): def common(self, data, params): if params.get("showMeta", 0): meta = data.meta for category in meta.directoryCategories: print "directoryCategory: %s" % \ self.boil(category["fullViewableName"]) for attr in [node for node in dir(meta) if \ node <> "directoryCategories" and node[:2] <> '__']: print "%s:" % attr, self.boil(getattr(meta, attr)) def doGoogleSearch(self, data, params): results = data.results if params.get("feelingLucky", 0): results = results[:1] if params.get("reverseOrder", 0): results.reverse() for result in results: for attr in dir(result): if attr == "directoryCategory": print "directoryCategory:", \ self.boil(result.directoryCategory["fullViewableName"]) elif attr[:2] <> '__': print "%s:" % attr, self.boil(getattr(result, attr)) print self.common(data, params) def doGetCachedPage(self, data, params): print data self.common(data, params) doSpellingSuggestion = doGetCachedPage def _makeFormatter(outputFormat): classname = "_%sOutputFormatter" % outputFormat.capitalize() return globals()[classname]() def _output(results, params): formatter = _makeFormatter(params.get("outputFormat", "text")) outputmethod = getattr(formatter, params["func"]) outputmethod(results, params) def main(argv): """ Command-line interface. """ if not argv: _usage() return q = None func = None http_proxy = None license_key = None feelingLucky = 0 showMeta = 0 reverseOrder = 0 runTest = 0 outputFormat = "text" try: opts, args = getopt.getopt(argv, "s:c:p:k:lmrx:hvt1", ["search=", "cache=", "spelling=", "key=", "lucky", "meta", "reverse", "proxy=", "help", "version", "test"]) except getopt.GetoptError: _usage() sys.exit(2) for opt, arg in opts: if opt in ("-s", "--search"): q = arg func = "doGoogleSearch" elif opt in ("-c", "--cache"): q = arg func = "doGetCachedPage" elif opt in ("-p", "--spelling"): q = arg func = "doSpellingSuggestion" elif opt in ("-k", "--key"): license_key = arg elif opt in ("-l", "-1", "--lucky"): feelingLucky = 1 elif opt in ("-m", "--meta"): showMeta = 1 elif opt in ("-r", "--reverse"): reverseOrder = 1 elif opt in ("-x", "--proxy"): http_proxy = arg elif opt in ("-h", "--help"): _usage() elif opt in ("-v", "--version"): _version() elif opt in ("-t", "--test"): runTest = 1 if runTest: setLicense(license_key) setProxy(http_proxy) _test() if args and not q: q = args[0] func = "doGoogleSearch" if func: results = globals()[func]( q, http_proxy=http_proxy, license_key=license_key ) _output(results, locals()) if __name__ == '__main__': main(sys.argv[1:])