from lxml import html
import requests
import re

# Download the first page of the grocery listing and parse it into an element
# tree. The timeout keeps the script from hanging on an unresponsive server,
# and raise_for_status() avoids silently scraping an HTTP error page.
page = requests.get('https://example-website.com/grocery?1=1&page=1', timeout=30)
page.raise_for_status()
tree = html.fromstring(page.content)

# XPath attribute tests need the "@" prefix ([@class="..."]); a bare `class`
# would be read as the name of a child element instead.
parent_div_xpath = '//li[@class="productPreview"]//div[@class="productPriceContainer productPriceSmall"]//span'
# Absolute queries for the two price spans; not used by the loop below.
title_filter = '//span[@class="productPromoPrice"]/text()'
copy_filter = '//span[@class="productOriginalPrice"]/text()'
filtered_html = tree.xpath(parent_div_xpath)

acc = []         # promo price text, one entry per matched node
twitch = []      # original price text, one entry per matched node
lastOnline = []  # data-time attribute of each matched node's parent

# Reuse the nodes selected above instead of running the same query twice.
for node in filtered_html:
    # normalize-space() already returns a string ('' when nothing matches),
    # so every list gets exactly one entry per node and the lists stay aligned.
    acc.append(node.xpath('normalize-space(span[@class="productPromoPrice"]/text())'))
    twitch.append(node.xpath('normalize-space(span[@class="productOriginalPrice"]/text())'))
    lastOnline.append(node.xpath('string(../@data-time)'))

# Combine the equal-length lists into (promo, original, time) rows.
xpath_list = list(zip(acc, twitch, lastOnline))
for row in xpath_list:
    # Print the row's fields separated by spaces, one row per line.
    print(*row)

Output

199 280 3 hrs ago
499 710 5 hrs ago

Another Example of Scraping Data from a Website

Program


from lxml import html
import requests
import re

# Download the first page of the grocery-and-staples listing and parse it.
# The timeout keeps the script from hanging on an unresponsive server, and
# raise_for_status() avoids silently scraping an HTTP error page.
page = requests.get('https://example-website.com/grocery-and-staples?p=1', timeout=30)
page.raise_for_status()
tree = html.fromstring(page.content)

# XPath attribute tests need the "@" prefix ([@class="..."]); a bare `class`
# would be read as the name of a child element instead.
parent_div_xpath = '//li[@class="item product product-item"]//div[@class="product-item-info"]'
filtered_html = tree.xpath(parent_div_xpath)

# Paths below are relative to one product-item-info <div>.
productName = "div[@class='product details product-item-details']/strong[@class='product name product-item-name']/a[@class='product-item-link']/text()"
oldPrice = "div[@class='product details product-item-details']/div[@class='price-box price-final_price']/span[@class='old-price']/span[@class='price-container price-final_price tax weee']/span[@class='price-wrapper ']/span[@class='price']/text()"
newPrice = "div[@class='product details product-item-details']//div[@class='price-box price-final_price']//span[@class='price-container price-final_price tax weee']//span[@class='price-wrapper ']//span[@class='price']/text()"
discount = "div[@class='product details product-item-details']/div[@class='price-box price-final_price']/span[@class='special-percent']/text()"
imageUrl = "div[@class='product photo product-item-photo']/a/img[@class='product-image-photo default_image']/@src"


def _extract(nodes, relative_path):
    """Return the whitespace-normalized value of relative_path for each node.

    normalize-space() yields '' when the path matches nothing, so the result
    always has exactly one string per node and the columns stay aligned.
    """
    query = "normalize-space(" + relative_path + ")"
    return [node.xpath(query) for node in nodes]


arrProductName = _extract(filtered_html, productName)
arrOldPrice = _extract(filtered_html, oldPrice)
arrNewPrice = _extract(filtered_html, newPrice)
arrDiscount = _extract(filtered_html, discount)
arrImageUrl = _extract(filtered_html, imageUrl)

# Combine the equal-length columns into (name, old, new, discount, image) rows.
xpath_list = list(zip(arrProductName, arrOldPrice, arrNewPrice, arrDiscount, arrImageUrl))
print(xpath_list)

Output

product name ...   product old price ...   product new price ...   product discount ...   product image URL ...
product name ...   product old price ...   product new price ...   product discount ...   product image URL ...
product name ...   product old price ...   product new price ...   product discount ...   product image URL ...

Note: The above tutorials are created for educational and learning purposes.

Previous Next