import requests
from bs4 import BeautifulSoup
# Relative directory that downloaded images are written into. The trailing
# slash matters: file names are appended to it by plain string concatenation.
IMGFOLDER = 'fruitimages/'
def run():
    """Scrape the USDA watercolor search results and download every image.

    Requests 380 result pages (the ``start`` query offset advances by 20 per
    page), reads the metadata of each ``div.document`` entry, prints it and
    saves the matching image via ``FruitInfo.downloadandsave``.
    """

    def link_text(definition_list, position):
        # Text of the <a> inside the position-th child of the definition
        # list, or the placeholder 'none' when that link is absent.
        link = definition_list.select_one(':nth-child({})>a'.format(position))
        return 'none' if link is None else link.get_text()

    for page in range(380):
        resp = requests.get(
            'https://usdawatercolors.nal.usda.gov/pom/search.xhtml?start={}&searchText=&searchField=&sortField='.format(
                page * 20))
        soup = BeautifulSoup(resp.text, 'html.parser')
        for div_idx, div in enumerate(soup.select('div.document')):
            doc = div.select_one('dl.defList')
            # Positions 2/4/6/8 presumably are the value cells following each
            # label in the definition list -- verify against the page markup.
            artist = link_text(doc, 2)
            year = link_text(doc, 4)
            scientific_name = link_text(doc, 6)
            common_name = link_text(doc, 8)
            thumb_src = div.select_one('div.thumb-frame>a>img')['src']
            # NOTE(review): the (page + 1) factor makes the first page's ids
            # start at 21 while the query offset is page * 20 -- confirm
            # whether ids were meant to start at 1. Kept as-is so existing
            # file names do not change.
            fruit_id = (page + 1) * 20 + div_idx + 1
            info = FruitInfo(fruit_id, artist, year, scientific_name, common_name, thumb_src)
            print(info)
            info.downloadandsave()
class FruitInfo:
    """Metadata of one watercolor entry plus the logic to download its image."""

    def __init__(self, id, artist, year, scientificname, commonname, thumbpicurl):
        """Store the scraped fields.

        Args:
            id: Number used as the leading part of the saved file name.
            artist: Artist name text.
            year: Year text.
            scientificname: Scientific name text.
            commonname: Common name text.
            thumbpicurl: ``src`` of the thumbnail image; the image id is
                taken from its third ``/``-separated component.
        """
        self.id = id
        self.artist = artist
        self.year = year
        self.scientificname = scientificname
        self.commonname = commonname
        self.thumbpicurl = thumbpicurl

    def downloadandsave(self):
        """Download the image for this entry and write it under ``IMGFOLDER``.

        The file name is ``<id>-<commonname>-<year>-<artist>.png`` with all
        spaces removed.
        """
        import os  # local import keeps this block self-contained

        filename = '{}-{}-{}-{}.png'.format(self.id, self.commonname, self.year, self.artist).replace(' ', '')
        print('filename = ', filename)
        oriimgurl = self._parseoriimgurl()
        print('original img url = ', oriimgurl)
        resp = requests.get(oriimgurl)
        # Create the target folder on first use so open() does not fail with
        # FileNotFoundError on a fresh checkout.
        os.makedirs(IMGFOLDER, exist_ok=True)
        with open(IMGFOLDER + filename, 'wb') as f:
            f.write(resp.content)
        print('saved...', filename)

    def _parseoriimgurl(self) -> str:
        """Return the ``/download/<img id>/screen`` URL derived from the thumbnail URL."""
        # The image id is the third '/'-separated piece of the thumbnail src.
        imgid = self.thumbpicurl.split('/')[2]
        print('img id = ', imgid)
        return 'https://usdawatercolors.nal.usda.gov/download/{}/screen'.format(imgid)

    # Backward-compatible alias for the previous, un-prefixed method name.
    parseoriimgurl = _parseoriimgurl

    def __str__(self):
        """Return a one-line summary of the entry (the id is not included)."""
        return 'FruitInfo(artist={},year={},scientificname={},commonname={},thumbpicurl={})'.format(
            self.artist,
            self.year,
            self.scientificname,
            self.commonname,
            self.thumbpicurl)
# Start the scrape only when this file is executed as a script, not on import.
if __name__ == '__main__':
    run()
# Comments  (stray page-footer text, not part of the script)