.*?(.*?).*?.*?(.*?)
~is';
// Run the pattern over the fetched page. preg_match_all() returns the number
// of full matches: 0 when nothing matched, false when the PCRE engine failed.
$matchCount = preg_match_all($pattern, $results['content'], $matches);

// NOTE(review): this section was originally annotated as
//   $matches[1] = title, $matches[2] = URL, $matches[3] = description
// yet the rows below emit group 2 under the "title" header and group 1 under
// the "URL" header. Either that annotation or the column order is wrong;
// confirm against the capture groups in $pattern before relying on either.
//
// Use tab to separate columns, as commas are often used within descriptions.
// Using commas for column delimitation would therefore break very easily.
//
// Guard on the match count rather than on $matches itself: preg_match_all()
// always fills $matches with one (possibly empty) array per capture group, so
// the array is truthy even when nothing matched and the header would be
// printed for an empty result set.
if ($matchCount) {
    echo "title\tURL\tdescription\n";
    foreach ($matches[2] as $index => $resultItem) {
        // strip_tags() removes any markup captured inside a group.
        echo strip_tags($resultItem) . "\t";
        echo strip_tags($matches[1][$index]) . "\t";
        // The trailing tab after the last column is kept so the row format
        // stays identical for existing consumers of this output.
        echo strip_tags($matches[3][$index]) . "\t";
        echo "\n";
    }
}
/**
 * Generic function for fetching the content and transfer info of a URL.
 *
 * The request is made with cURL, which follows up to 10 redirects on its own.
 * If the final response is still a 301/302, the Location header is looked up
 * with get_headers() and the function calls itself on that target.
 *
 * @param $url The URL to fetch. It is trimmed and URL-decoded, and
 *             HTML-escaped "&amp;" separators are turned back into "&".
 * @param $timeout The time to wait, in seconds, both for the connection and
 *                 for the whole transfer.
 * @param $redirects_left How many manual (get_headers based) redirect hops may
 *                        still be taken; stops redirect cycles from recursing
 *                        forever.
 * @return Array with 'content' (the response body, or false if curl_exec()
 *               failed) and 'header' (the curl_getinfo() array).
 */
function get_url($url, $timeout = 5, $redirects_left = 10)
{
    // URLs scraped out of HTML carry "&amp;" between query parameters.
    $url = str_replace("&amp;", "&", urldecode(trim($url)));

    $ch = curl_init();
    curl_setopt($ch, CURLOPT_USERAGENT, USER_AGENT);
    curl_setopt($ch, CURLOPT_URL, $url);
    // An empty cookie file name switches on cURL's in-memory cookie handling
    // (cookies survive across the redirects of this transfer) without
    // creating a temporary cookie-jar file that nobody cleans up.
    curl_setopt($ch, CURLOPT_COOKIEFILE, "");
    curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true);
    curl_setopt($ch, CURLOPT_ENCODING, ""); // accept every encoding cURL supports
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
    curl_setopt($ch, CURLOPT_AUTOREFERER, true);
    // NOTE(review): peer verification is disabled, so HTTPS certificates are
    // not checked and the transfer is open to man-in-the-middle tampering.
    // Left as is to preserve behaviour; enable it once a CA bundle is available.
    curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, false);
    curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, $timeout);
    curl_setopt($ch, CURLOPT_TIMEOUT, $timeout);
    curl_setopt($ch, CURLOPT_MAXREDIRS, 10);

    $content = curl_exec($ch);
    $response = curl_getinfo($ch);
    curl_close($ch);

    // cURL normally resolves redirects itself; a 301/302 here means it could
    // not (or would not) follow the last hop, so try once more by hand.
    $is_redirect = $response['http_code'] == 301 || $response['http_code'] == 302;
    if ($is_redirect && $redirects_left > 0) {
        // get_headers() uses the stream wrapper, which reads its user agent
        // from this ini setting.
        ini_set("user_agent", USER_AGENT);
        if ($headers = get_headers($response['url'])) {
            foreach ($headers as $value) {
                if (substr(strtolower($value), 0, 9) == "location:") {
                    // Keep the caller's timeout and count this hop.
                    return get_url(trim(substr($value, 9)), $timeout, $redirects_left - 1);
                }
            }
        }
    }

    return array('content' => $content, 'header' => $response);
}
?>