页面的抓取与保存

简单实验 飞快学 450浏览

下面的代码实现了从新闻页面 http://www.chinanews.com/gj/2018/01-03/8415037.shtml 抓取信息并保存到数据库的功能。

初步实现功能

<?php
    // Fetches one chinanews.com article, extracts its headline text and body
    // markup, and stores them through the Post model (presumably provided by
    // db_sakila.php). $title and $content stay in scope so that markup placed
    // after this script can echo them.
    require_once '../simplehtmldom_1_5/simple_html_dom.php';
    require_once 'db_sakila.php';

    header("Content-type:text/html;charset=utf-8");
    $url = "http://www.chinanews.com/gj/2018/01-03/8415037.shtml";

    // file_get_contents() returns false when the request fails; stop here
    // rather than parsing (and saving) an empty document.
    $page_source = file_get_contents($url);
    if ($page_source === false) {
        exit('Failed to fetch ' . $url);
    }

    $html = new simple_html_dom();
    $html->load($page_source);

    // find() with an index yields null when the selector matches nothing,
    // so check both nodes before reading properties from them.
    $title_node = $html->find('#cont_1_1_2  h1', 0);
    $content_node = $html->find('.left_zw', 0);
    if (!$title_node || !$content_node) {
        exit('Unexpected page structure: ' . $url);
    }

    $title = $title_node->plaintext;     // headline without tags
    $content = $content_node->outertext; // article body including its wrapper element

    $post = new Post();
    $post->title = $title;
    $post->content = $content;
    // NOTE(review): the publication time is a fixed literal, not read from the page.
    $post->pub_time = "2018-01-03 10:35";
    $post->url = $url;
    $post->priority = 0;
    $post->save();
?>

<?php
// Template that displays a scraped article.
// Expects $title (plain text) and $content (HTML markup) to be defined
// before this point.
?>
<!DOCTYPE html>
<html>
<head>
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8">
<title>PHP So Easy 首页文章</title>
</head>
<body>
<?php
// The title originates from a remote page, so escape it for the HTML context.
// double_encode is off so entities already present in the text are kept as-is.
?>
<h1><?= htmlspecialchars($title, ENT_QUOTES, 'UTF-8', false) ?></h1>
<?php
// NOTE(review): $content is remote HTML echoed verbatim so the article keeps
// its formatting; any script it contains would run in the viewer's browser.
// Sanitize it before using this outside of an experiment.
?>
<?=  $content ?>
</body>
</html>

优化：改写为函数（保存前先检查该链接是否已入库）

<?php

require_once '../simplehtmldom_1_5/simple_html_dom.php'; 
require_once 'db_sakila.php'; 

/**
 * Fetches the news article at $url and saves it as a new Post, unless
 * Post::all() already finds a record for the same url.
 *
 * @param string $url      Address of the article page to fetch.
 * @param string $pub_time Publication time stored on the record. Defaults to
 *                         the literal the function previously hard-coded.
 * @return bool true after the record has been saved; false when a record for
 *              the url already exists, the page could not be fetched, or the
 *              expected title/body elements are missing.
 */
function get_news_to_db($url, $pub_time = "2018-01-03 10:35")
{
    // Skip urls that are already stored.
    $options = array('url' => $url);
    if (Post::all($options)) return false;

    // file_get_contents() returns false on failure; nothing to parse then.
    $page_source = file_get_contents($url);
    if ($page_source === false) return false;

    $html = new simple_html_dom();
    $html->load($page_source);

    // find() with an index yields null when the selector matches nothing.
    $title_node = $html->find('#cont_1_1_2  h1', 0);
    $content_node = $html->find('.left_zw', 0);
    if (!$title_node || !$content_node) {
        $html->clear();
        return false;
    }

    // Copy the strings out before releasing the DOM.
    $title = $title_node->plaintext;
    $content = $content_node->outertext;

    // Release the parsed tree explicitly so repeated calls do not accumulate
    // memory (the library documents clear() for this purpose).
    $html->clear();
    unset($html);

    $post = new Post();
    $post->title = $title;
    $post->content = $content;
    $post->pub_time = $pub_time;
    $post->url = $url;
    $post->priority = 0;
    $post->save();
    return true;
}

// Alternative sample article (disabled):
// $url = 'http://www.chinanews.com/gj/2018/01-03/8415037.shtml';
// Article to fetch and store; the boolean returned by get_news_to_db() is not used here.
$url = 'http://www.chinanews.com/gj/2018/01-03/8415240.shtml';
get_news_to_db($url);
?>

从列表中抓取超链接

// Print the href of every headline link found on the world-news index page.
$root_url = 'http://www.chinanews.com/world.shtml';
$index_source = file_get_contents($root_url);

$html = new simple_html_dom();
$html->load($index_source);

// Anchors inside .dd_bt elements beneath .content_list.
$urls = $html->find('.content_list .dd_bt a');
foreach ($urls as $anchor) {
    $href = $anchor->href;
    echo $href, "<br>";
}

链接优化：将相对链接补全为完整地址

// Print every headline link from the world-news index page as an absolute URL.
$root_url = 'http://www.chinanews.com/world.shtml';
$base_url = 'http://www.chinanews.com/';

// file_get_contents() returns false when the request fails.
$index_source = file_get_contents($root_url);
if ($index_source === false) {
    exit('Failed to fetch ' . $root_url);
}

$html = new simple_html_dom();
$html->load($index_source);
$urls = $html->find('.content_list .dd_bt a');

// Scheme reused for protocol-relative links ("//host/path").
$base_scheme = parse_url($base_url, PHP_URL_SCHEME);
if (!$base_scheme) {
    $base_scheme = 'http';
}

foreach ($urls as $url)
{
    $href = $url->href;
    // Anchors without a usable href have nothing to print.
    if (!is_string($href) || $href === '') {
        continue;
    }

    if (parse_url($href, PHP_URL_SCHEME)) {
        // Already absolute (carries its own scheme): keep unchanged.
        $full_url = $href;
    } elseif (strpos($href, '//') === 0) {
        // Protocol-relative: only the scheme is missing.
        $full_url = $base_scheme . ':' . $href;
    } else {
        // Relative path: join with exactly one slash between base and path.
        $full_url = rtrim($base_url, '/') . '/' . ltrim($href, '/');
    }

    echo $full_url . "<br>";
}