Classe PHP para obter palavras usadas para busca

Artigo que apresenta a implementação de uma classe PHP que determina as palavras usadas nos mecanismos de busca a partir da URL de referência (HTTP_REFERER).

Classe para obter as palavras usadas para buscar um site. Basta usar o método getWords.

Linguagem: PHP

Copyright 2010 Rubens Takiguti Ribeiro

Licença: LGPL 3 ou superior

/**
 * Class SearchWords
 *
 * Extracts the words a visitor typed into a search engine from the URL of
 * the search results page (typically the HTTP referer). All methods are
 * static; the main entry point is SearchWords::getWords().
 *
 * @author Rubens Takiguti Ribeiro
 */
class SearchWords {

    /**
     * Search engines whose search terms travel in a URL parameter.
     * Keys are parameter names (case-sensitive); values are the main
     * domains (as returned by parseHost) that use that parameter.
     * @var array[string => array[string]]
     */
    private static $param_domains = array(
        'q' => array(
            'google', 'bing', 'altavista', 'aol', 'galaxy', 'dibdabdoo',
            'gigablast', 'alexa', 'blogscope', 'icerocket', 'sphere',
            'technorati', 'freebooksearch', 'exalead', 'nstein', 'oracle',
            'cheatsearch', 'mahalo', 'rollyo', 'trexy', 'accoona', 'alleba',
            'ansearch', 'daum', 'guruji', 'najdi', 'sapo', 'search', 'walla'
        ),
        'p'                 => array('yahoo'),
        's'                 => array('email-search'),
        'qt'                => array('saic', 'onet'),
        'wd'                => array('baidu'),
        'MT'                => array('goo', 'rediff'),
        'query'             => array(
            'hotbot', 'lycos', 'autonomy', 'funnelback', 'vivisimo', 'naver',
            'rambler'
        ),
        'keywords'          => array('kidsclick'),
        'Keywords'          => array('askmenow'),
        's_keyword'         => array('souq'),
        'search'            => array('ifac'),
        'SearchText'        => array('alibaba'),
        'searchq'           => array('mymcpl'),
        'search_query'      => array('youtube'),
        'tsearch'           => array('blogperfect'),
        'simplequerystring' => array('dieselpoint'),
        'text'              => array('yandex'),
        'terms'             => array('awesomelibrary'),
        'Nrt'               => array('endeca')
    );

    /**
     * Gets key-words used in search sites by its url.
     *
     * Always returns a string: the decoded search words, or an empty string
     * when the URL has no host, the host is not a known search engine, or
     * the expected parameter / path segment is missing.
     *
     * @param string $referer_url URL to be checked
     * @return string Words
     * @throws InvalidArgumentException If the URL is too malformed for parse_url
     */
    public static function getWords($referer_url) {
        $url_data = parse_url($referer_url);
        if ($url_data === false) {
            throw new InvalidArgumentException('Invalid url: '.$referer_url, 1);
        }
        if (!isset($url_data['host'])) {
            return '';
        }
        $host_data = self::parseHost($url_data['host']);

        // parseHost leaves main_domain unset for unknown TLDs, IPs, "localhost"...
        if (!isset($host_data['main_domain'])) {
            return '';
        }
        $main_domain = $host_data['main_domain'];

        // Based on query string
        foreach (self::$param_domains as $param => $domains) {
            if (in_array($main_domain, $domains, true)) {
                return self::getUrlParam($param, $url_data);
            }
        }

        // Based on path
        $path = isset($url_data['path']) ? $url_data['path'] : '';
        switch ($main_domain) {
        case 'omgili':
        case 'eurekster':
        case 'wink':
        case 'miner':
            return urldecode($path);
        case 'wikipedia':
            // Article title is the last path segment. rawurldecode is used so
            // a literal "+" in a title (e.g. "C++") is not turned into a space.
            $slash = strrpos($path, '/');
            $title = ($slash === false) ? $path : (string)substr($path, $slash + 1);
            return rawurldecode($title);
        case 'excite':
            // Words are in the segment that follows ".../Web/"
            $segments = explode('/', $path);
            $pos = array_search('Web', $segments, true);
            if ($pos !== false && isset($segments[$pos + 1])) {
                return urldecode($segments[$pos + 1]);
            }
            return '';
        }

        // Not a known search engine
        return '';
    }


    /**
     * Looks a parameter up in the query string of a parsed URL and, when it
     * is not found there, in the fragment (some engines emit URLs such as
     * "http://www.google.com/#q=words").
     * @param string $param Parameter to be get
     * @param array $url_data Result of parse_url
     * @return string Parameter value or empty string if not found
     */
    private static function getUrlParam($param, array $url_data) {
        foreach (array('query', 'fragment') as $part) {
            if (!isset($url_data[$part])) {
                continue;
            }
            $value = self::getQueryParam($param, $url_data[$part]);
            if ($value !== '') {
                return $value;
            }
        }
        return '';
    }


    /**
     * Return a parameter value of a query string.
     * @param string $param Parameter to be get
     * @param string $query Query string (without the leading "?")
     * @return string Parameter value (URL-decoded) or empty string if the
     *     parameter is absent or is not a plain string (e.g. "q[]=a")
     */
    private static function getQueryParam($param, $query) {
        if (!is_string($query) || $query === '') {
            return '';
        }
        parse_str($query, $query_data);
        if (isset($query_data[$param]) && is_string($query_data[$param])) {
            return $query_data[$param];
        }
        return '';
    }


    /**
     * Gets host informations.
     *
     * The host is lower-cased and a trailing root dot is dropped before it
     * is split, since host names are case-insensitive. Labels are consumed
     * from right to left: country code, then generic domain, then the main
     * domain; whatever is left becomes the sub domain. When the last label
     * is neither a known country code nor a known generic domain, no
     * "main_domain" key is produced.
     *
     * @param string $host Host to be checked
     * @return array[string => string] Associative array with domain data.
     *     Potential keys are:
     *     - country
     *     - propose
     *     - main_domain
     *     - sub_domain
     */
    public static function parseHost($host) {
        $data = array();

        $country_domain = self::getCountryDomain();
        $propose_domain = self::getProposeDomain();

        $host_domains = explode('.', rtrim(strtolower((string)$host), '.'));

        $domain = array_pop($host_domains);
        if (in_array($domain, $country_domain, true)) {
            $data['country'] = $domain;
            $domain = array_pop($host_domains);
            if (in_array($domain, $propose_domain, true)) {
                $data['propose'] = $domain;
                $domain = array_pop($host_domains);
            }
            $data['main_domain'] = $domain;
        } elseif (in_array($domain, $propose_domain, true)) {
            $data['propose'] = $domain;
            $domain = array_pop($host_domains);
            $data['main_domain'] = $domain;
        }
        if (!empty($host_domains)) {
            $data['sub_domain'] = implode('.', $host_domains);
        }
        return $data;
    }


    /**
     * Return an array of generic proposed domains
     * @return array[string]
     */
    public static function getProposeDomain() {
        return array(
            'aero', 'asia', 'biz', 'cat', 'co', 'com', 'coop', 'edu', 'gov', 'info', 'int',
            'jobs', 'mil', 'mobi', 'museum', 'name', 'net', 'org', 'pro', 'tel', 'travel'
        );
    }


    /**
     * Return an array of country domains
     * @return array[string]
     */
    public static function getCountryDomain() {
        return array(
            'ac', 'ad', 'ae', 'af', 'ag', 'ai', 'al', 'am', 'an', 'ao', 'aq', 'ar', 'as',
            'at', 'au', 'aw', 'ax', 'az', 'ba', 'bb', 'bd', 'be', 'bf', 'bg', 'bh', 'bi',
            'bj', 'bm', 'bn', 'bo', 'br', 'bs', 'bt', 'bv', 'bw', 'by', 'bz', 'ca', 'cc',
            'cd', 'cf', 'cg', 'ch', 'ci', 'ck', 'cl', 'cm', 'cn', 'co', 'cr', 'cu', 'cv',
            'cx', 'cy', 'cz', 'de', 'dj', 'dk', 'dm', 'do', 'dz', 'ec', 'ee', 'eg', 'er',
            'es', 'et', 'eu', 'fi', 'fj', 'fk', 'fm', 'fo', 'fr', 'ga', 'gb', 'gd', 'ge',
            'gf', 'gg', 'gh', 'gi', 'gl', 'gm', 'gn', 'gp', 'gq', 'gr', 'gs', 'gt', 'gu',
            'gw', 'gy', 'hk', 'hm', 'hn', 'hr', 'ht', 'hu', 'id', 'ie', 'il', 'im', 'in',
            'io', 'iq', 'ir', 'is', 'it', 'je', 'jm', 'jo', 'jp', 'ke', 'kg', 'kh', 'ki',
            'km', 'kn', 'kp', 'kr', 'kw', 'ky', 'kz', 'la', 'lb', 'lc', 'li', 'lk', 'lr',
            'ls', 'lt', 'lu', 'lv', 'ly', 'ma', 'mc', 'md', 'me', 'mg', 'mh', 'mk', 'ml',
            'mm', 'mn', 'mo', 'mp', 'mq', 'mr', 'ms', 'mt', 'mu', 'mv', 'mw', 'mx', 'my',
            'mz', 'na', 'nc', 'ne', 'nf', 'ng', 'ni', 'nl', 'no', 'np', 'nr', 'nu', 'nz',
            'om', 'pa', 'pe', 'pf', 'pg', 'ph', 'pk', 'pl', 'pm', 'pn', 'pr', 'ps', 'pt',
            'pw', 'py', 'qa', 're', 'ro', 'rs', 'ru', 'rw', 'sa', 'sb', 'sc', 'sd', 'se',
            'sg', 'sh', 'si', 'sj', 'sk', 'sl', 'sm', 'sn', 'so', 'sr', 'st', 'su', 'sv',
            'sy', 'sz', 'tc', 'td', 'tf', 'tg', 'th', 'tj', 'tk', 'tl', 'tm', 'tn', 'to',
            'tp', 'tr', 'tt', 'tv', 'tw', 'tz', 'ua', 'ug', 'uk', 'us', 'uy', 'uz', 'va',
            'vc', 've', 'vg', 'vi', 'vn', 'vu', 'wf', 'ws', 'ye', 'yt', 'za', 'zm', 'zw'
        );
    }
}

Exemplo de uso:

// Words used to reach the current site. The Referer header is optional,
// so fall back to an empty URL (getWords then returns an empty string).
$url = isset($_SERVER['HTTP_REFERER']) ? $_SERVER['HTTP_REFERER'] : '';
$words = SearchWords::getWords($url);

// Words used in a given link
$url = 'http://www.google.com.br/#hl=pt-BR&source=hp&biw=1269&bih=567&q=Rubens+Takiguti+Ribeiro&aq=f&aqi=&aql=&oq=&gs_rfai=&fp=45e6118c2d7b8c30';
$words = SearchWords::getWords($url);

0 comentários