Crawling map tile images at different zoom levels
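For scale: in the standard slippy-map tile scheme used by OpenStreetMap, zoom level z contains 2**z × 2**z tiles named z/x/y.png, which is what drives the nested range loops in the scripts below. A quick back-of-the-envelope check:

# Tiles per zoom level in the slippy-map scheme: 2**z per axis, 4**z in total.
for z in (8, 10):
    print('zoom %d: %d tiles per axis, %d tiles in total' % (z, 2 ** z, 4 ** z))
# zoom 8: 256 tiles per axis, 65536 tiles in total
# zoom 10: 1024 tiles per axis, 1048576 tiles in total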

import urllib
import os

url_list = []

# Generate tile URLs such as 8/0/0.png, 8/0/1.png ... 8/0/255.png ... 8/255/255.png
def create_url(zoom, rate):
    # At zoom level z there are 2**z tile rows and 2**z tile columns.
    for y in range(2 ** rate):
        for z in range(2 ** rate):
            url_list.append(str(zoom) + '/' + str(y) + '/' + str(z) + '.png')
            print str(zoom) + '/' + str(y) + '/' + str(z) + '.png'
    return url_list

# Create the local directory tree matching the tile paths
def create_dirs(url_list, base_filepath):
    for x in url_list:
        parts = x.split('/')
        file_path = base_filepath + parts[0] + '/' + parts[1] + '/'
        if not os.path.exists(file_path):
            print file_path
            os.makedirs(file_path)

base_url = 'http://a.tile.openstreetmap.org/'

# Download every tile to the matching local path
def download_png(url_list, filepath):
    for x in url_list:
        url = base_url + x
        urllib.urlretrieve(url, filename=filepath + x)

url_list = create_url(8, 8)
download_png(url_list, 'd:/test/')
#create_dirs(url_list, 'd:/test/')

Usage

  • First adjust the parameters such as the file path and zoom level.
  • Then comment out the download_png call and run create_dirs once to build the directory tree; after that, uncomment download_png and start downloading the tiles.
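Note that the script above is Python 2 only (print statements, urllib.urlretrieve). A rough Python 3 port of the same downloader is sketched below; the tile scheme, base URL, and d:/test/ path come from the script above, while the function name download_tiles and the on-demand directory creation are choices of this sketch:

import os
import urllib.request

BASE_URL = 'http://a.tile.openstreetmap.org/'

def download_tiles(zoom, out_dir):
    # Walk every x/y tile index at the given zoom level.
    for x in range(2 ** zoom):
        for y in range(2 ** zoom):
            rel = '%d/%d/%d.png' % (zoom, x, y)
            target = os.path.join(out_dir, rel)
            # Create directories on demand instead of a separate create_dirs pass.
            os.makedirs(os.path.dirname(target), exist_ok=True)
            urllib.request.urlretrieve(BASE_URL + rel, target)

# Example: download_tiles(8, 'd:/test/')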

Crawling with multiple processes, and handling IOError caused by the network

import os
import time
import urllib
import logging
import multiprocessing

logging.basicConfig(level=logging.WARNING,
                    format='%(asctime)s %(filename)s[line:%(lineno)d] %(levelname)s %(message)s',
                    datefmt='%a, %d %b %Y %H:%M:%S',
                    filename='myapp.log',
                    filemode='w')

filepath = 'd:/test/'
base_url = 'http://a.tile.openstreetmap.org/'

# Generate tile URLs for rows start..end-1 at zoom level rate, so that
# each process works on its own, non-overlapping band of rows.
def create_url(start, end, rate):
    url_list = []
    for y in range(start, end):
        for z in range(2 ** rate):
            url_list.append(str(rate) + '/' + str(y) + '/' + str(z) + '.png')
            logging.warning(str(rate) + '/' + str(y) + '/' + str(z) + '.png')
    return url_list

def create_dirs(url_list, base_filepath):
    for x in url_list:
        parts = x.split('/')
        file_path = base_filepath + parts[0] + '/' + parts[1] + '/'
        if not os.path.exists(file_path):
            logging.warning(file_path)
            os.makedirs(file_path)

def download_png(url_list, filepath):
    for x in url_list:
        try:
            url = base_url + x
            print url
            logging.warning(url)
            urllib.urlretrieve(url, filename=filepath + x)
        except IOError as serr:
            # Network error: log it, back off for three minutes, then retry once.
            logging.error(serr)
            time.sleep(180)
            urllib.urlretrieve(url, filename=filepath + x)

# Every process runs the same worker, each over its own row band.
def worker(start, end, rate):
    url_list = create_url(start, end, rate)
    create_dirs(url_list, filepath)
    download_png(url_list, filepath)

if __name__ == "__main__":
    # Zoom level 10 has 2**10 = 1024 rows; rows from 630 on are split
    # into five non-overlapping bands, one per process.
    bands = [(630, 700), (700, 800), (800, 900), (900, 1000), (1000, 1024)]
    processes = [multiprocessing.Process(target=worker, args=(s, e, 10))
                 for s, e in bands]
    for p in processes:
        p.start()
    for p in processes:
        p.join()
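The original script imports requests without using it; the same retry-on-IOError idea can also be written with requests and a bounded retry loop. This is a sketch only (Python 3): fetch_tile, the retry count, and the timeout are assumptions of the sketch, while the base URL and the 180-second backoff come from the script above.

import os
import time
import logging
import requests

BASE_URL = 'http://a.tile.openstreetmap.org/'

def fetch_tile(rel_path, out_dir, retries=3, backoff=180):
    # Download one tile, retrying on network errors with a fixed backoff.
    target = os.path.join(out_dir, rel_path)
    os.makedirs(os.path.dirname(target), exist_ok=True)
    for attempt in range(retries):
        try:
            resp = requests.get(BASE_URL + rel_path, timeout=30)
            resp.raise_for_status()
            with open(target, 'wb') as f:
                f.write(resp.content)
            return True
        except (requests.RequestException, IOError) as err:
            logging.error('%s (attempt %d): %s', rel_path, attempt + 1, err)
            time.sleep(backoff)
    return False

# Example: fetch_tile('10/630/0.png', 'd:/test/')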