爬虫:用 PHP 实现百度首页书签的获取与迁移
1,获取旧账户的 cookie、token 等数据:在浏览器中打开“审查元素”(开发者工具),
然后在页面上点击“添加分类”,即可在发出的请求中看到 cookie、token 等数据
2,利用正则表达式提取旧账户的书签信息,并为新账户创建对应的书签分类,同时把返回的分类 id 存入 redis(代码中的 curl 是自己封装的类)
/**
 * Copies the bookmark categories of the old Baidu account to the new account.
 *
 * Steps:
 *  1. Fetch the Baidu home page with the old account's cookie and extract,
 *     via regular expressions, every category name together with the
 *     "title;href" pairs of the links found inside it.
 *  2. For each category, POST an `add_dir` command with the new account's
 *     cookie and remember the directory id from the response.
 *
 * Results are written to Redis:
 *  - `dir_create`: category name => list of "title;href" strings, plus a
 *    `dir_id` entry for every category that was created successfully.
 *  - `dir_error`:  the categories whose creation failed (only written when at
 *    least one failure occurs).
 *
 * @return void
 */
public function actionCreatedir()
{
    $redis = Yii::$app->redis;
    $curl = new curl();

    // Request the home page while authenticated as the OLD account.
    $curl->setOption(CURLOPT_COOKIE, 'BxxxxxxxxxxxxxxxAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAJcur1uXLq9bW; BD_HOME=1; H_PS_PSSID=1423_21088_18559_26350; BD_UPN=12314353'); // cookie of the old account
    $curl->setOption(CURLOPT_REFERER, 'https://www.baidu.com/');
    // NOTE(review): presumably keeps the handle open so it can be reused for
    // the POST requests below - confirm against the curl class.
    $curl->close = false;
    $curl->setUrl('http://www.baidu.com/');
    $html = (string) $curl->get();

    // Group 1: category name, group 2: HTML fragment holding that category's links.
    $categoryPattern = "/<span class=name-text.*>((?!<).*)<\/span>.*(<div.*dir-content.*del-dir)/isU";
    // Group 1: bookmark title, group 2: bookmark URL.
    $linkPattern = "/<a.*title=\"(.*)\".*href=\"(.*)\"/isU";

    $categories = [];
    preg_match_all($categoryPattern, $html, $categories);

    $result = [];
    foreach (($categories[2] ?? []) as $index => $fragment) {
        $links = [];
        preg_match_all($linkPattern, $fragment, $links);

        $need = [];
        foreach (($links[1] ?? []) as $linkIndex => $title) {
            // Each bookmark is stored as "title;href".
            $need[] = $title . ';' . $links[2][$linkIndex];
        }
        $result[$categories[1][$index]] = $need;
    }

    $createCateUrl = 'https://www.baidu.com/home/subscribe/submit/manoperation';
    $newCookie = 'BAIDUID=F6xxxxxxxxxxxxxxxxxx'; // cookie of the new account
    $jsonFlags = JSON_UNESCAPED_SLASHES | JSON_UNESCAPED_UNICODE; // same value as the former literal 320

    $errors = [];
    // Iterate by value and write back through the key: a by-reference loop
    // variable would stay bound to the last element after the loop.
    foreach ($result as $dirName => $links) {
        $curl->setOption(CURLOPT_COOKIE, $newCookie);
        $curl->setUrl($createCateUrl);

        $payload = [
            'cmd' => 'add_dir',
            'dirName' => $dirName,
            'tabid' => 1,
            'indextype' => 'manht',
            'bsToken' => '53887830be8b71f61233282aaff9a7d4bfb',
            '_req_seqid' => '0xc9e6ddb612300040933',
            'sid' => '1433_21079123_26350_20928',
        ];

        $response = json_decode((string) $curl->post($payload, 'build'), true);
        $dirId = is_array($response) ? ($response['data']['dirId'] ?? null) : null;

        // A transport failure or non-JSON body decodes to null; that must be
        // recorded as an error, not stored as a category with a null id.
        $failed = !is_array($response)
            || (string) ($response['errNo'] ?? '0') !== '0'
            || $dirId === null;

        if ($failed) {
            $errors[$dirName] = $links;
            // Written on every failure so partial progress survives an abort.
            $redis->set('dir_error', json_encode($errors, $jsonFlags));
            continue;
        }

        $result[$dirName]['dir_id'] = $dirId;
    }

    $redis->set('dir_create', json_encode($result, $jsonFlags));
}
3,创建具体的书签(上一步已把各分类及其下的书签保存下来,这里遍历并逐个创建即可)
/**
 * Creates the individual bookmarks in the new account.
 *
 * Reads the category map stored under the Redis key `dir_create` and, for
 * every category that has a `dir_id`, submits one `add` command per stored
 * "title;href" entry.
 *
 * Optional POST parameters (each falls back to a hard-coded default):
 *  - cookie: cookie header of the new account
 *  - qid:    value sent as `_req_seqid`
 *  - token:  value sent as `bsToken`
 *  - sid:    value sent as `sid`
 *
 * @return void
 */
public function actionCreateitem()
{
    // Read only the expected fields; extract() on request data would let a
    // caller define arbitrary local variables.
    $request = Yii::$app->request->post();
    if (!is_array($request)) {
        $request = [];
    }
    $cookie = $request['cookie'] ?? 'BAIDUID=F6D138xxxxxxxxxxxxxS_PSSID=1433_21079_26350_20928; BD_UPN=12314353'; // cookie of the new account
    $qid = $request['qid'] ?? '0x9d28b81xxxxxxxxx00011e6a';
    $token = $request['token'] ?? '53887830bxxxxxxxxx282aaff9a7d4bfb';
    $sid = $request['sid'] ?? '1433_xxxxxxx0_20928';

    $redis = Yii::$app->redis;
    $bookmark = json_decode((string) $redis->get('dir_create'), true);
    if (!is_array($bookmark)) {
        // Nothing stored yet (or the stored value is not valid JSON).
        return;
    }

    $curl = new curl();
    $curl->setOption(CURLOPT_COOKIE, $cookie);
    // NOTE(review): presumably keeps the handle open across requests - confirm
    // against the curl class.
    $curl->close = false;

    foreach ($bookmark as $item) {
        // Categories without a dir_id were not created, so they are skipped.
        if (!is_array($item) || !isset($item['dir_id'])) {
            continue;
        }

        foreach ($item as $key => $entry) {
            // Numeric keys hold bookmarks; the 'dir_id' key is metadata.
            if (!is_numeric($key)) {
                continue;
            }

            // Split on the first ';' only, so a URL containing ';' stays whole.
            $parts = explode(';', (string) $entry, 2);
            if (count($parts) < 2) {
                continue;
            }

            // Build a fresh payload instead of reusing the request array, so
            // unrelated POST fields (e.g. the cookie) are not forwarded.
            $payload = [
                'cmd' => 'add',
                'from' => 'u_layer',
                'name' => $parts[0],
                'url' => $parts[1],
                'customDirId' => $item['dir_id'],
                'tabid' => '1',
                'indextype' => 'manht',
                '_req_seqid' => $qid,
                'bsToken' => $token,
                'sid' => $sid,
            ];

            self::create_item($payload, $curl);
        }
    }
}
/**
 * Submits one bookmark command to the Baidu "manoperation" endpoint.
 *
 * @param array $data Form fields to send in the POST request.
 * @param curl  $curl Client used for the request; its URL is overwritten here.
 *
 * @return mixed Whatever curl::post() returns (its shape is not visible in
 *               this file). Previously the response was discarded; returning
 *               it lets callers inspect the outcome if they wish.
 */
public static function create_item($data, curl $curl)
{
    $curl->setUrl('https://www.baidu.com/home/subscribe/submit/manoperation');

    return $curl->post($data, 'build');
}
效果图:
旧账户的书签图
新账户的书签图(注意:未分类的书签没有被抓取到,因此没有迁移过来):