In the following tutorials you will learn that how to scrap specific data from any website and save scraped data in MySQL Database using Python programming language.
from lxml import html
import requests
import re
import mysql.connector
cnx = mysql.connector.connect(user='root', password='',
host='127.0.0.1',
database='python_db')
page = requests.get('http://www.example-website.com/fashion/western-wear/tops.html?limit=96')
tree = html.fromstring(page.content)
productName = tree.xpath('//ul[class=""="products-grid"]//h2[class=""="product-name"]//a[@href]/text()')
productBrand = tree.xpath('//ul[class=""="products-grid"]//h2[class=""="product-name"]//div[class=""="cstm_brnd"]//span[1]/text()')
oldPrices = tree.xpath('//ul[class=""="products-grid"]//p[class=""="old-price"]//span[class=""="price"]/text()')
newPrices = tree.xpath('//ul[class=""="products-grid"]//p[class=""="special-price"]//span[class=""="price"]/text()')
discount = tree.xpath('//ul[class=""="products-grid"]//span[class=""="discount_Span"]/text()')
imagePath = tree.xpath('//ul[class=""="products-grid"]//img[class=""="b-lazy"]//@data-src')
#imagePath = tree.xpath('//img/@data-src')
limit = 0;
for arrayRange in productName:
limit = limit + 1
def index_exists(ls, i):
return (0 <= i < len(ls)) or (-len(ls) <= i < 0)
pName = "";
pBrand = "";
pOldPrices = "";
pNewPrices = "";
pDiscount = "";
pImagePath = "";
cursor = cnx.cursor()
for indexNumber in range(limit):
if index_exists(productName, indexNumber):
pName = productName[indexNumber]
else:
pName = "NA"
if index_exists(productBrand, indexNumber):
pBrand = productBrand[indexNumber];
else:
pBrand = "NA"
if index_exists(oldPrices, indexNumber):
pOldPrices = re.sub('\s+','',oldPrices[indexNumber])
else:
pOldPrices = "NA"
if index_exists(newPrices, indexNumber):
pNewPrices = re.sub('\s+','',newPrices[indexNumber])
else:
pNewPrices = "NA"
if index_exists(discount, indexNumber):
pDiscount = discount[indexNumber]
else:
pDiscount = "NA"
if index_exists(imagePath, indexNumber):
pImagePath = imagePath[indexNumber]
else:
pImagePath = "NA"
print(pName)
print(pBrand)
print(pOldPrices)
print(pNewPrices)
print(pDiscount)
print("https://www.example-website.com/"+pImagePath)
cursor.execute("INSERT INTO data(product_name, product_brand, old_price, new_price, discount, image_path)VALUES"
"('"+pName+"','"+pBrand+"','"+pOldPrices+"','"+pNewPrices+"','"+pDiscount+"','"+pImagePath+"')")
print("--------Data saved in MySQL Database--------")
cnx.close()
print("All Data is saved in MySQL Database")
from lxml import html
import requests
import re
import mysql.connector
cnx = mysql.connector.connect(user='root', password='',
host='127.0.0.1',
database='scraping')
page = requests.get('https://example-website.com/grocery-and-staples?p=1')
tree = html.fromstring(page.content)
parent_div_xpath = '//li[class=""="item product product-item"]//div[class=""="product-item-info"]'
filtered_html = tree.xpath(parent_div_xpath)
productName = "div[class=""='product details product-item-details']/strong[class=""='product name product-item-name']/a[class=""='product-item-link']/text()"
oldPrice = "div[class=""='product details product-item-details']/div[class=""='price-box price-final_price']/span[class=""='old-price']/span[class=""='price-container price-final_price tax weee']/span[class=""='price-wrapper ']/span[class=""='price']/text()"
newPrice = "div[class=""='product details product-item-details']//div[class=""='price-box price-final_price']//span[class=""='price-container price-final_price tax weee']//span[class=""='price-wrapper ']//span[class=""='price']/text()"
discount = "div[class=""='product details product-item-details']/div[class=""='price-box price-final_price']/span[class=""='special-percent']/text()"
imageUrl = "div[class=""='product photo product-item-photo']/a/img[class=""='product-image-photo default_image']/@src"
arrProductName = [];
arrOldPrice = [];
arrNewPrice = [];
arrDiscount = [];
arrImageUrl = [];
for index in filtered_html:
arrProductName.append(index.xpath("concat(normalize-space("+productName+"[contains('','')]),'')"))
arrOldPrice.append(index.xpath("concat(normalize-space("+oldPrice+"[contains('Rs','Rs')]),'')"))
arrNewPrice.append(index.xpath("concat(normalize-space("+newPrice+"[contains('Rs','Rs')]),'')"))
arrDiscount.append(index.xpath("concat(normalize-space("+discount+"[contains('','')]),'')"))
arrImageUrl.append(index.xpath("concat(normalize-space("+imageUrl+"[contains('','')]),'')"))
limit = 0;
for arrayRange in arrProductName:
limit = limit + 1
def isNotEmpty(s):
return bool(s and s.strip())
cursor = cnx.cursor()
for indexNumber in range(limit):
if isNotEmpty(arrProductName[indexNumber]):
pName = arrProductName[indexNumber]
else:
pName = "NA"
if isNotEmpty(arrOldPrice[indexNumber]):
pOldPrices = arrOldPrice[indexNumber]
else:
pOldPrices = "NA"
if isNotEmpty(arrNewPrice[indexNumber]):
pNewPrices = arrNewPrice[indexNumber]
else:
pNewPrices = "NA"
if isNotEmpty(arrDiscount[indexNumber]):
pDiscount = arrDiscount[indexNumber]
else:
pDiscount = "NA"
if isNotEmpty(arrImageUrl[indexNumber]):
pImageUrl = arrImageUrl[indexNumber]
else:
pImageUrl = "NA"
print(pName)
print(pOldPrices)
print(pNewPrices)
print(pDiscount)
print(pImageUrl)
cursor.execute("INSERT INTO data(product_name, old_price, new_price, discount, image_url) VALUES "
"('"+pName+"','"+pOldPrices+"','"+pNewPrices+"','"+pDiscount+"','"+pImageUrl+"')")
print("--------Data saved in MySQL Database--------")
cnx.close()
print("--------All Data is saved in MySQL Database--------")
# ZIP EQUAL LENGTH LISTS
#xpath_list = list(zip(arrProductName, arrOldPrice, arrNewPrice, arrDiscount, arrImageUrl))
#print(xpath_list)
Note: The above tutorials are created for educational and learning purposes.