“丝路通:分布式爬虫任务分配”的版本间的差异
来自CloudWiki
第175行: | 第175行: | ||
print ("Database version : %s " % data) | print ("Database version : %s " % data) | ||
+ | |||
+ | # 关闭数据库连接 | ||
+ | db.close()</nowiki> | ||
+ | |||
+ | ===查询数据库=== | ||
+ | <nowiki>import pymysql | ||
+ | |||
+ | |||
+ | # 打开数据库连接 | ||
+ | db = pymysql.connect("10.0.0.30","root","000000","crawler" )# 使用cursor()方法获取操作游标 | ||
+ | cursor = db.cursor() | ||
+ | |||
+ | # SQL 查询语句 | ||
+ | sql = "SELECT * FROM task \ | ||
+ | WHERE 1" | ||
+ | try: | ||
+ | # 执行SQL语句 | ||
+ | cursor.execute(sql) | ||
+ | # 获取所有记录列表 | ||
+ | results = cursor.fetchall() | ||
+ | for row in results: | ||
+ | print(row) | ||
+ | except: | ||
+ | print ("Error: unable to fetch data") | ||
# 关闭数据库连接 | # 关闭数据库连接 | ||
db.close()</nowiki> | db.close()</nowiki> |
2020年9月17日 (四) 15:13的版本
任务切割
将原始的待爬目录表 分割成许多小份,当作许多小任务去完成。
敦煌网
import time

# Prefix for generated task files: dh_task_1.csv, dh_task_2.csv, ...
task_header = '../../task/dh_task/dh_task_'  # header


def assign_task(src='dh_sub_category.csv', chunk_size=100, header=task_header):
    """Split the Dunhuang (dh) sub-category CSV into small task files.

    Every `chunk_size` lines from `src` are written to `<header><n>.csv`.
    Any trailing remainder (fewer than `chunk_size` lines) is written to
    one additional file `<header><n+1>.csv`.

    Bug fix vs. the original: the remainder used to be APPENDED to the
    last full chunk file, silently creating one oversized task (and an
    empty file was still touched when the line count divided evenly).
    The unused `s = set()` local was also removed.
    """
    num = 0          # index of the last chunk file written
    buffer = []      # lines accumulated for the current chunk
    with open(src, 'rt') as fp:
        for count, line in enumerate(fp, start=1):
            buffer.append(line)
            if count % chunk_size == 0:
                num += 1
                with open(header + str(num) + '.csv', 'w', encoding='utf-8') as fw:
                    fw.writelines(buffer)
                buffer = []
    # Leftover lines become their own (smaller) task file.
    if buffer:
        with open(header + str(num + 1) + '.csv', 'w', encoding='utf-8') as fw:
            fw.writelines(buffer)


if __name__ == '__main__':
    assign_task()
阿里巴巴
import time

# Prefix for generated task files: ali_task_1.csv, ali_task_2.csv, ...
task_header = '../../task/ali_task/ali_task_'  # header


def assign_task(src='alibaba_categary.csv', chunk_size=100, header=task_header):
    """Split the Alibaba category CSV into small task files.

    Every `chunk_size` lines from `src` are written to `<header><n>.csv`.
    Any trailing remainder (fewer than `chunk_size` lines) goes into one
    additional file `<header><n+1>.csv`.

    Bug fix vs. the original: leftover lines were appended to the LAST
    full chunk file when `num <= 50` (oversizing one task) and to file
    `num+1` otherwise — an inconsistent, arbitrary threshold. Leftovers
    now always get their own fresh file, and nothing is written when
    the remainder is empty.
    """
    num = 0          # index of the last chunk file written
    buffer = []      # lines accumulated for the current chunk
    with open(src, 'rt') as fp:
        for count, line in enumerate(fp, start=1):
            buffer.append(line)
            if count % chunk_size == 0:
                num += 1
                with open(header + str(num) + '.csv', 'w', encoding='utf-8') as fw:
                    fw.writelines(buffer)
                buffer = []
    if buffer:
        with open(header + str(num + 1) + '.csv', 'w', encoding='utf-8') as fw:
            fw.writelines(buffer)


if __name__ == '__main__':
    assign_task()
中国制造网
import time

# Prefix for generated task files: mc_task_1.csv, mc_task_2.csv, ...
task_header = '../../task/mc_task/mc_task_'  # header


def assign_task(src='made_in_china_sub_cat.csv', chunk_size=200, header=task_header):
    """Split the Made-in-China sub-category CSV into small task files.

    Every `chunk_size` lines from `src` are written to `<header><n>.csv`.
    Any trailing remainder (fewer than `chunk_size` lines) goes into one
    additional file `<header><n+1>.csv`.

    Bug fix vs. the original: leftover lines were appended to the LAST
    full chunk file when `num <= 100` (oversizing one task) and to file
    `num+1` otherwise — an inconsistent threshold. Leftovers now always
    get their own fresh file, and nothing is written when the remainder
    is empty.
    """
    num = 0          # index of the last chunk file written
    buffer = []      # lines accumulated for the current chunk
    with open(src, 'rt') as fp:
        for count, line in enumerate(fp, start=1):
            buffer.append(line)
            if count % chunk_size == 0:
                num += 1
                with open(header + str(num) + '.csv', 'w', encoding='utf-8') as fw:
                    fw.writelines(buffer)
                buffer = []
    if buffer:
        with open(header + str(num + 1) + '.csv', 'w', encoding='utf-8') as fw:
            fw.writelines(buffer)


if __name__ == '__main__':
    assign_task()
任务表建立
安装mysql
- Centos7 安装python3,本项目安装python3.6
- Centos7 安装MySQL
建立数据表
CREATE DATABASE crawler; MariaDB [crawler]> CREATE TABLE IF NOT EXISTS `task`( -> `id` INT UNSIGNED AUTO_INCREMENT, -> `site_title` VARCHAR(100) NOT NULL, -> `task_name` VARCHAR(40) NOT NULL, -> `task_status` INT UNSIGNED NOT NULL, -> `availability_zones` VARCHAR(60) NOT NULL, -> `start_date` DATE, -> PRIMARY KEY ( `id` ))ENGINE=InnoDB DEFAULT CHARSET=utf8; Query OK, 0 rows affected (0.01 sec) MariaDB [crawler]> INSERT INTO task(site_title,task_name,task_status,availability_zones,start_date) VALUES('alibaba','ali_task_1','0','A','20200609'); Query OK, 1 row affected (0.00 sec) MariaDB [crawler]> select * from task; +----+------------+------------+-------------+--------------------+------------+ | id | site_title | task_name | task_status | availability_zones | start_date | +----+------------+------------+-------------+--------------------+------------+ | 1 | alibaba | ali_task_1 | 0 | A | 2020-06-09 | +----+------------+------------+-------------+--------------------+------------+
关闭防火墙
systemctl disable firewalld systemctl stop firewalld
在公网上须设置防火墙,不能一关了之
setenforce 0
数据库连接
import pymysql

# Open the database connection.
# NOTE: PyMySQL 1.0+ removed positional connection arguments, so the
# original `pymysql.connect("10.0.0.30","root","000000","crawler")`
# raises TypeError on current versions — pass them by keyword.
db = pymysql.connect(host="10.0.0.30",      # host name
                     user="root",           # user name
                     password="000000",     # password
                     database="crawler")    # database name
try:
    # Create a cursor and run a trivial query to verify connectivity.
    cursor = db.cursor()
    cursor.execute("SELECT VERSION()")
    # fetchone() returns a single row (here, the server version string).
    data = cursor.fetchone()
    print ("Database version : %s " % data)
finally:
    # Always close the connection, even if the query fails.
    db.close()
查询数据库
import pymysql

# Open the database connection.
# NOTE: PyMySQL 1.0+ removed positional connection arguments, so the
# original positional call raises TypeError on current versions —
# pass host/user/password/database by keyword instead.
db = pymysql.connect(host="10.0.0.30",
                     user="root",
                     password="000000",
                     database="crawler")
# Get an operation cursor via cursor().
cursor = db.cursor()

# SQL query: fetch every row of the task table.
sql = "SELECT * FROM task WHERE 1"
try:
    # Execute the SQL statement.
    cursor.execute(sql)
    # Fetch all result rows and print them one per line.
    results = cursor.fetchall()
    for row in results:
        print(row)
except Exception:
    # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit
    # still propagate instead of being swallowed.
    print ("Error: unable to fetch data")
finally:
    # Guarantee the connection is released on every path.
    db.close()