Scraping data from multiple html tables within one website in python -
I am trying to scrape time-series data from this website in Python: http://www.boerse-frankfurt.de/en/etfs/db+x+trackers+msci+world+information+technology+trn+index+ucits+etf+lu0540980496/price+turnover+history/historical+data#page=1
I've gotten pretty far, but I don't know how to get all of the data — only the first 50 rows that are visible on the page. To view the rest online, you have to click through the results at the bottom of the table. I would like to be able to specify a start and end date in Python and get all of the corresponding dates and prices in a list. Here is what I have so far:
# Scrape the first visible page of the historical price/turnover table.
from bs4 import BeautifulSoup
import requests
import re

url = ('http://www.boerse-frankfurt.de/en/etfs/'
       'db+x+trackers+msci+world+information+technology+trn+index+ucits+etf+lu0540980496/'
       'price+turnover+history/historical+data')

# Pass an explicit parser name so BeautifulSoup does not have to guess
# (the original imported lxml, so use it directly).
soup = BeautifulSoup(requests.get(url).text, 'lxml')

# The bs4 method is find_all, not findall; class_ selects by CSS class.
dates = soup.find_all('td', class_='column-date')
# Strip all whitespace (newlines, tabs, spaces) from each cell's text.
dates = [re.sub(r'\s', '', d.string) for d in dates]
prices = soup.find_all('td', class_='column-price')
prices = [re.sub(r'\s', '', p.string) for p in prices]
You need to loop through the rest of the pages. You can use a POST request for that. The server expects to receive a structure in each POST request; that structure is defined below in `values`. The page number is the `'page'` parameter of the structure. The structure has several other parameters that I have not tested but which would be interesting to try, such as `items_per_page`, `max_time` and `min_time`. Here is example code:
# Page through every result page by POSTing the same form the site's own
# pagination uses, accumulating dates and prices until a page comes back empty.
#
# Uses requests (consistent with the question's snippet) instead of the
# Python-2-only urllib.urlencode / urllib.urlopen APIs.
from bs4 import BeautifulSoup
import requests
import re

url = 'http://www.boerse-frankfurt.de/en/parts/boxes/history/_histdata_full.m'

# Form structure the server expects on each POST; 'page' is advanced below.
# max_time / min_time bound the date range returned.
values = {
    'component_id': 'preeb7da7a4f4654f818494b6189b755e76',
    'ag': '103708549',
    'boerse_id': '12',
    'include_url': '/parts/boxes/history/_histdata_full.m',
    'item_count': '96',
    'items_per_page': '50',
    'lang': 'en',
    'link_id': '',
    'max_time': '2014-09-20',
    'min_time': '2014-05-09',
    'page': 1,
    'page_size': '50',
    'pages_total': '2',
    'secu': '103708549',
    'template': '0',
    'titel': '',
    'title': '',
    'title_link': '',
    'use_external_secu': '1',
}

dates = []
prices = []
while True:  # 'true' is not a Python name; the loop condition must be True
    response = requests.post(url, data=values)
    soup = BeautifulSoup(response.text, 'lxml')

    # find_all (not findall) is the bs4 method; strip whitespace from cells.
    temp_dates = soup.find_all('td', class_='column-date')
    temp_dates = [re.sub(r'\s', '', d.string) for d in temp_dates]
    temp_prices = soup.find_all('td', class_='column-price')
    temp_prices = [re.sub(r'\s', '', p.string) for p in temp_prices]

    if not temp_prices:
        break  # an empty page means we have walked past the last result

    dates += temp_dates
    prices += temp_prices
    values['page'] += 1
Comments
Post a Comment