The Alexa list of websites is provided as a spreadsheet with two columns: the ranking and the domain name.
|   | A | B            |
|---|---|--------------|
| 1 | 1 | google.com   |
| 2 | 2 | facebook.com |
| 3 | 3 | youtube.com  |
| … | … | …            |
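Under the hood this spreadsheet is just a CSV file, with one rank,domain pair per line. As a minimal sketch (the sample rows are copied from the table above, not from the real file), `csv.reader` yields each line as a (rank, domain) pair:

```python
import csv
from StringIO import StringIO

# Sample rows matching the table above; the real file has one line per site.
sample = '1,google.com\n2,facebook.com\n3,youtube.com\n'
for rank, domain in csv.reader(StringIO(sample)):
    print rank, domain   # e.g. "1 google.com"
```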
Scraping this data involves the following four steps:
1. Download the .zip file.
2. Extract the CSV file from the .zip file.
3. Parse the CSV file.
4. Iterate over every row of the CSV file and extract the data from it.
The following code implements these steps. First, the contents of downloader.py:
# -*- coding: utf-8 -*-
import re
import urlparse
import urllib2
import time
from datetime import datetime
import robotparser
import Queue
from bs4 import BeautifulSoup
import sys
reload(sys)
sys.setdefaultencoding('utf-8')
import csv
import lxml.html
import random
import cssselect
import socket

DEFAULT_AGENT = 'wswp'
DEFAULT_DELAY = 5
DEFAULT_RETRIES = 1
DEFAULT_TIMEOUT = 60


class Throttle:
    """Throttle downloading by sleeping between requests to the same domain."""
    def __init__(self, delay):
        # amount of delay between downloads for each domain
        self.delay = delay
        # timestamp of when a domain was last accessed
        self.domains = {}

    def wait(self, url):
        domain = urlparse.urlparse(url).netloc
        last_accessed = self.domains.get(domain)
        if self.delay > 0 and last_accessed is not None:
            sleep_secs = self.delay - (datetime.now() - last_accessed).seconds
            if sleep_secs > 0:
                # this domain was accessed recently, so wait before downloading again
                time.sleep(sleep_secs)
        self.domains[domain] = datetime.now()


class Downloader:
    def __init__(self, delay=5, user_agent='wswp', proxies=None, num_retries=1,
                 timeout=60, opener=None, cache=None):
        socket.setdefaulttimeout(timeout)
        self.throttle = Throttle(delay)
        self.user_agent = user_agent
        self.proxies = proxies
        self.num_retries = num_retries
        self.opener = opener
        self.cache = cache

    def __call__(self, url):
        result = None
        if self.cache:
            try:
                result = self.cache[url]
            except KeyError:
                # URL is not available in the cache
                pass
            #else:
            #    if result is not None and self.num_retries > 0 and 500 <= result['code'] < 600:
            #        # server error in the cached result, so re-download
            #        result = None
        if result is None:
            # result was not in the cache, so it still needs to be downloaded
            self.throttle.wait(url)
            proxy = random.choice(self.proxies) if self.proxies else None
            headers = {'User-agent': self.user_agent}
            result = self.download(url, headers, proxy, self.num_retries)
            if self.cache:
                # save the result in the cache
                self.cache[url] = result
        return result['html']

    def download(self, url, headers, proxy, num_retries, data=None):
        print 'Downloading:', url
        request = urllib2.Request(url, data, headers or {})
        opener = self.opener or urllib2.build_opener()
        if proxy:
            proxy_params = {urlparse.urlparse(url).scheme: proxy}
            opener.add_handler(urllib2.ProxyHandler(proxy_params))
        try:
            response = opener.open(request)
            html = response.read()
            code = response.code
        except Exception as e:
            print 'Download error:', str(e)
            html = ''
            if hasattr(e, 'code'):
                code = e.code
                if num_retries > 0 and 500 <= code < 600:
                    # server error, so retry the download
                    return self.download(url, headers, proxy, num_retries - 1, data)
            else:
                code = None
        return {'html': html, 'code': code}
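As a quick sanity check, once the module above is saved as downloader.py it can be used as follows. This is only a minimal sketch, and the URL is a placeholder rather than one taken from the text:

```python
# Minimal usage sketch: download a page while respecting the per-domain delay.
from downloader import Downloader

D = Downloader(delay=5, user_agent='wswp', num_retries=2)
html = D('http://example.com')   # placeholder URL; returns the HTML, or '' on error
print len(html)
```

With the downloader in place, the script that retrieves and parses the Alexa list is short: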
# -*- coding: utf-8 -*-
import csv
from zipfile import ZipFile
from StringIO import StringIO
from downloader import Downloader

D = Downloader()
zipped_data = D('http://s3.amazonaws.com/alexa-static/top-1m.csv.zip')
urls = []
with ZipFile(StringIO(zipped_data)) as zf:
    # the archive contains a single CSV file, so take the first name in the list
    csv_filename = zf.namelist()[0]
    for _, website in csv.reader(zf.open(csv_filename)):
        # the second column holds the domain name
        urls.append('http://' + website)
You may have noticed that the downloaded zipped data is wrapped with StringIO before being passed to ZipFile. This is because ZipFile expects a file-like interface rather than a string. Next, we extract the list of file names from the archive; since this .zip file contains only a single file, we can simply select the first name. The CSV file is then iterated over, and the domain name in the second column of each row is appended to the URL list.
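Note that the code above targets Python 2 (urllib2, StringIO, print statements). As a rough sketch only, the same four steps could be written for Python 3 along these lines, with io.BytesIO taking the place of StringIO and io.TextIOWrapper decoding the CSV stream (throttling and caching are omitted here):

```python
# Python 3 sketch of the same four steps, without the Downloader class.
import csv
import io
from urllib.request import urlopen
from zipfile import ZipFile

zipped_data = urlopen('http://s3.amazonaws.com/alexa-static/top-1m.csv.zip').read()
urls = []
with ZipFile(io.BytesIO(zipped_data)) as zf:      # ZipFile needs a file-like object
    csv_filename = zf.namelist()[0]               # the archive holds a single CSV file
    with zf.open(csv_filename) as f:
        reader = csv.reader(io.TextIOWrapper(f, encoding='utf-8'))
        for _, website in reader:                 # each row is (rank, domain)
            urls.append('http://' + website)
print(urls[:5])
```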