Download papers from CSH

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
import os
import re

import requests
from bs4 import BeautifulSoup


# Index page listing the articles of the "p53 family" subject collection.
p53url = "http://cshperspectives.cshlp.org/site/misc/the_p53_family.xhtml"

# Pieces used to turn an article link into the URL of its full-text PDF.
prefix = 'http://cshperspectives.cshlp.org/content'
tail = '.full.pdf'

# Directory (relative to the current working directory) receiving the PDFs.
output_dir = 'p53'

# Seconds to wait on the server before giving up on a request.
request_timeout = 60


def build_pdf_url(href):
    """Return the full-text PDF URL for the article link *href*.

    Everything up to and including the last ``short`` in *href* is dropped
    and the remainder is placed between ``prefix`` and ``tail``.
    NOTE(review): assumes every article href contains ``short`` followed by
    the article path; an href without it is appended to ``prefix`` unchanged.
    """
    article_path = re.sub('.*short', '', href)
    return ''.join([prefix, article_path, tail])


def build_pdf_filename(index, title):
    """Return the local path ``p53/<index>.<title>.pdf`` for an article.

    ``/`` would be read as a directory separator, so it is replaced by ``-``;
    ``:`` is replaced by ``--`` (presumably because it is not allowed in file
    names on some platforms).
    """
    name = '%d.%s.pdf' % (index, title)
    name = name.replace(':', '--').replace('/', '-')
    return output_dir + '/' + name


def main():
    """Download every paper linked from the p53 collection page.

    Raises:
        requests.RequestException: if the index page cannot be fetched.
        RuntimeError: if the index page has no ``subj_coll`` element.
    """
    index_page = requests.get(p53url, timeout=request_timeout)
    index_page.raise_for_status()

    # Name the parser explicitly so the result does not depend on which
    # optional parsers happen to be installed.
    soup = BeautifulSoup(index_page.text, 'html.parser')
    collection = soup.find(id='subj_coll')
    if collection is None:
        raise RuntimeError("No element with id 'subj_coll' in %s" % p53url)

    # href=True skips anchors without a target, which cannot be downloaded.
    anchors = collection.find_all('a', href=True)
    papers = [
        (build_pdf_url(anchor.get('href')),
         build_pdf_filename(number, anchor.text))
        for number, anchor in enumerate(anchors, 1)
    ]

    # open() does not create missing directories.
    if papers and not os.path.isdir(output_dir):
        os.makedirs(output_dir)

    for url, filename in papers:
        print('Downloading %s' % filename)
        try:
            response = requests.get(url, timeout=request_timeout)
            # Do not save an HTML error page under a .pdf name.
            response.raise_for_status()
        except requests.RequestException as err:
            print('Failed to download %s: %s\n\n' % (url, err))
            continue
        with open(filename, 'wb') as pdf_file:
            pdf_file.write(response.content)
        print('%s Done!\n\n' % filename)


if __name__ == '__main__':
    main()
2014-05-17 15:3821
comments powered by Disqus