Scraping Data and Saving in MySQL in Python

In the following tutorials you will learn that how to scrap specific data from any website and save scraped data in MySQL Database using Python programming language.

Scraping Data and Saving in MySQL Database

Program

from lxml import html
import requests
import re
import mysql.connector

cnx = mysql.connector.connect(user='root', password='',
                              host='127.0.0.1',
                              database='python_db')

page = requests.get('http://www.example-website.com/fashion/western-wear/tops.html?limit=96')
tree = html.fromstring(page.content)

productName = tree.xpath('//ul[class=""="products-grid"]//h2[class=""="product-name"]//a[@href]/text()')
productBrand = tree.xpath('//ul[class=""="products-grid"]//h2[class=""="product-name"]//div[class=""="cstm_brnd"]//span[1]/text()')
oldPrices = tree.xpath('//ul[class=""="products-grid"]//p[class=""="old-price"]//span[class=""="price"]/text()')
newPrices = tree.xpath('//ul[class=""="products-grid"]//p[class=""="special-price"]//span[class=""="price"]/text()')
discount = tree.xpath('//ul[class=""="products-grid"]//span[class=""="discount_Span"]/text()')
imagePath = tree.xpath('//ul[class=""="products-grid"]//img[class=""="b-lazy"]//@data-src')
#imagePath = tree.xpath('//img/@data-src')

limit = 0;
for arrayRange in productName:
    limit = limit + 1

def index_exists(ls, i):
    return (0 <= i < len(ls)) or (-len(ls) <= i < 0)

pName = "";
pBrand = "";
pOldPrices = "";
pNewPrices = "";
pDiscount = "";
pImagePath = "";

cursor = cnx.cursor()
for indexNumber in range(limit):
    if index_exists(productName, indexNumber):
        pName = productName[indexNumber]
    else:
        pName = "NA"
    if index_exists(productBrand, indexNumber):
        pBrand = productBrand[indexNumber];
    else:
        pBrand = "NA"
    if index_exists(oldPrices, indexNumber):
        pOldPrices = re.sub('\s+','',oldPrices[indexNumber])
    else:
        pOldPrices = "NA"
    if index_exists(newPrices, indexNumber):
         pNewPrices = re.sub('\s+','',newPrices[indexNumber])
    else:
        pNewPrices = "NA"
    if index_exists(discount, indexNumber):
        pDiscount = discount[indexNumber]
    else:
        pDiscount = "NA"
    if index_exists(imagePath, indexNumber):
        pImagePath = imagePath[indexNumber]
    else:
        pImagePath = "NA"

	print(pName)
    print(pBrand)
    print(pOldPrices)
    print(pNewPrices)
    print(pDiscount)
    print("https://www.example-website.com/"+pImagePath)
    cursor.execute("INSERT INTO data(product_name, product_brand, old_price, new_price, discount, image_path)VALUES"
                   "('"+pName+"','"+pBrand+"','"+pOldPrices+"','"+pNewPrices+"','"+pDiscount+"','"+pImagePath+"')")
    print("--------Data saved in MySQL Database--------")
cnx.close()

print("All Data is saved in MySQL Database")
Output
product name ...
product brand ...
product price ...
product discount ...
product image URL ...
--------Data saved in MySQL Database--------
product name ...
product brand ...
product price ...
product discount ...
product image URL ...
--------Data saved in MySQL Database--------
All Data is saved in MySQL Database

Scraping Data and Saving in MySQL Database another Example

Program

from lxml import html
import requests
import re
import mysql.connector

cnx = mysql.connector.connect(user='root', password='',
                              host='127.0.0.1',
                              database='scraping')

page = requests.get('https://example-website.com/grocery-and-staples?p=1')
tree = html.fromstring(page.content)

parent_div_xpath = '//li[class=""="item product product-item"]//div[class=""="product-item-info"]'
filtered_html = tree.xpath(parent_div_xpath)

productName = "div[class=""='product details product-item-details']/strong[class=""='product name product-item-name']/a[class=""='product-item-link']/text()"
oldPrice = "div[class=""='product details product-item-details']/div[class=""='price-box price-final_price']/span[class=""='old-price']/span[class=""='price-container price-final_price tax weee']/span[class=""='price-wrapper ']/span[class=""='price']/text()"
newPrice = "div[class=""='product details product-item-details']//div[class=""='price-box price-final_price']//span[class=""='price-container price-final_price tax weee']//span[class=""='price-wrapper ']//span[class=""='price']/text()"
discount = "div[class=""='product details product-item-details']/div[class=""='price-box price-final_price']/span[class=""='special-percent']/text()"
imageUrl = "div[class=""='product photo product-item-photo']/a/img[class=""='product-image-photo default_image']/@src"

arrProductName = [];
arrOldPrice = [];
arrNewPrice = [];
arrDiscount = [];
arrImageUrl = [];

for index in filtered_html:
    arrProductName.append(index.xpath("concat(normalize-space("+productName+"[contains('','')]),'')"))
    arrOldPrice.append(index.xpath("concat(normalize-space("+oldPrice+"[contains('Rs','Rs')]),'')"))
    arrNewPrice.append(index.xpath("concat(normalize-space("+newPrice+"[contains('Rs','Rs')]),'')"))
    arrDiscount.append(index.xpath("concat(normalize-space("+discount+"[contains('','')]),'')"))
    arrImageUrl.append(index.xpath("concat(normalize-space("+imageUrl+"[contains('','')]),'')"))

limit = 0;
for arrayRange in arrProductName:
    limit = limit + 1

def isNotEmpty(s):
    return bool(s and s.strip())

cursor = cnx.cursor()
for indexNumber in range(limit):
    if isNotEmpty(arrProductName[indexNumber]):
        pName = arrProductName[indexNumber]
    else:
        pName = "NA"
    if isNotEmpty(arrOldPrice[indexNumber]):
        pOldPrices = arrOldPrice[indexNumber]
    else:
        pOldPrices = "NA"
    if isNotEmpty(arrNewPrice[indexNumber]):
         pNewPrices = arrNewPrice[indexNumber]
    else:
        pNewPrices = "NA"
    if isNotEmpty(arrDiscount[indexNumber]):
        pDiscount = arrDiscount[indexNumber]
    else:
        pDiscount = "NA"
    if isNotEmpty(arrImageUrl[indexNumber]):
        pImageUrl = arrImageUrl[indexNumber]
    else:
        pImageUrl = "NA"

    print(pName)
    print(pOldPrices)
    print(pNewPrices)
    print(pDiscount)
    print(pImageUrl)

    cursor.execute("INSERT INTO data(product_name, old_price, new_price, discount, image_url) VALUES "
                   "('"+pName+"','"+pOldPrices+"','"+pNewPrices+"','"+pDiscount+"','"+pImageUrl+"')")
    print("--------Data saved in MySQL Database--------")
cnx.close()
print("--------All Data is saved in MySQL Database--------")

# ZIP EQUAL LENGTH LISTS
#xpath_list = list(zip(arrProductName, arrOldPrice, arrNewPrice, arrDiscount, arrImageUrl))
#print(xpath_list)
Output
product name ...
product old price ...
product new price ...
product discount ...
product image URL ...
--------Data saved in MySQL Database--------
product name ...
product old price ...
product new price ...
product discount ...
product image URL ...
--------Data saved in MySQL Database--------
--------All Data is saved in MySQL Database--------

Note: The above tutorials are created for educational and learning purposes.