require 'open-uri'
require 'set'
require 'timeout'

# Once more than this many URLs are waiting in the queue, newly fetched pages
# are still downloaded but their links are no longer added to the queue.
ARRAY_LIMIT = 10_000

# Matches anchor tags of the exact form `<a href="...">` and captures the
# href value. Anchors with other attributes before href are not matched.
LINK_PATTERN = /<a href="(.*?)"/

# Extracts the links of a page as absolute URLs.
#
# @param base [String] URL of the page the HTML came from; relative hrefs
#   are resolved against it
# @param html [String] page body
# @return [Array<String>] absolute http/https URLs with fragments removed,
#   in document order (may contain duplicates)
def extract_links(base, html)
  html.scan(LINK_PATTERN).flatten.each_with_object([]) do |href, found|
    begin
      target = URI.join(base, href.strip)
    rescue URI::Error
      next # malformed href: skip it instead of aborting the whole page
    end
    # URI::HTTPS is a subclass of URI::HTTP, so this keeps both schemes and
    # drops mailto:, javascript:, ftp: and similar links.
    next unless target.is_a?(URI::HTTP)

    target.fragment = nil # "#section" variants point at the same document
    found << target.to_s
  end
end

queue = ['http://www.baidu.com']
# Every URL that has ever been queued. Checking against this set (rather than
# de-duplicating only the pending queue) prevents fetching a page twice.
seen = Set.new(queue)

# Breadth-first crawl; stops when there is nothing left to visit.
until queue.empty?
  url = queue.shift
  begin
    # URI.open is required here: Kernel#open no longer handles URLs as of
    # Ruby 3.0, and it would run a subprocess for a value starting with "|".
    URI.open(url) do |page|
      # scrub replaces invalid byte sequences so the regex scan cannot raise.
      body = page.read.scrub
      next if queue.length > ARRAY_LIMIT

      # base_uri is the final URL after redirects, which is the correct base
      # for resolving relative links.
      extract_links(page.base_uri.to_s, body).each do |link|
        queue << link if seen.add?(link)
      end
    end
  rescue Timeout::Error
    print "超时\n"
    next
  rescue StandardError => e
    # Non-timeout failures (DNS errors, HTTP 4xx/5xx, bad URLs, ...) are
    # reported as such instead of being mislabelled as timeouts.
    print "抓取失败: #{url} (#{e.class})\n"
    next
  end
  puts queue.length
end
目的很简单:给定一个网址,沿着该页面上的链接依次往下爬,尽可能多地抓取页面,所以写了上面这段简单的代码。想在其中加入多进程或多线程,应该怎么做?