Website Crawler in Python

In the following tutorial you will learn how to create a website crawler in the Python programming language to crawl website URLs.
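
The crawler is built on the requests and beautifulsoup4 packages (install them with pip if you do not have them already). The core idea is: download a page, parse its HTML, and walk through the anchor tags it contains. Below is a minimal sketch of that idea, using a placeholder URL:

import requests
from bs4 import BeautifulSoup

# Download a page and list every link it contains.
# The URL below is only a placeholder.
source_code = requests.get('https://example-website.com/').text
soup = BeautifulSoup(source_code, 'html.parser')
for link in soup.find_all('a'):
    print(link.get('href'))

The full program below applies the same idea across several pages, counts how often each tag appears, and saves the counts to a file.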

Website Crawler

Program

import requests
from bs4 import BeautifulSoup
import operator
import os
import sys

# Maps each tag's text to the number of times it has been seen.
Tag_Rank = {}

def tag_crawler(url):
    # Download the page and count each tag link it contains.
    source_code = requests.get(url).text
    soup = BeautifulSoup(source_code, 'html.parser')
    # In practice, pass site-specific attributes to find_all(), e.g.
    # soup.find_all('a', {'class': 'some-tag-class'}), so that only the
    # tag links are counted rather than every anchor on the page.
    for tag_link in soup.find_all('a'):
        tag = tag_link.string
        if tag in Tag_Rank:
            Tag_Rank[tag] += 1
        else:
            Tag_Rank[tag] = 1

def ques_links_crawler(base_url, page_limit):
    page_no = 1
    while page_no <= page_limit:
        # Append the page number as a query parameter; the exact parameter
        # name depends on the site being crawled.
        page_url = base_url + '?page=' + str(page_no)
        source_code = requests.get(page_url).text
        soup = BeautifulSoup(source_code, 'html.parser')
        if page_no == 1:
            os.system('clear')
        print('crawling page ' + str(page_no) + ': [', end='')
        prev_len = 0
        q_no = 1
        # As above, narrow find_all() with site-specific attributes so that
        # only the question links are followed.
        for ques_link in soup.find_all('a'):
            url = 'https://example-website.com/' + str(ques_link.get('href'))
            print(str(ques_link.get('href')))
            tag_crawler(url)
            # Erase the previous "] (..%)" text, extend the progress bar by
            # one '#', then reprint the closing bracket and percentage.
            for _ in range(prev_len):
                print('\b', end='')
            print('#', end='')
            p_cent = q_no * 2  # assumes roughly 50 links per page
            percent = '] (' + str(p_cent) + '%) '
            prev_len = len(percent)
            print(percent, end='')
            sys.stdout.flush()
            q_no += 1
        page_no += 1

def start():
    page_limit = int(input('Enter number of pages to crawl : '))
    os.system('clear')
    print('starting crawling...')
    ques_links_crawler('https://example-website.com/', page_limit)
    # Write the tags to a file, sorted by frequency in descending order.
    fw = open('tags_frequency.txt', 'w')
    for key, value in sorted(Tag_Rank.items(), key=operator.itemgetter(1), reverse=True):
        try:
            fw.write(key + " : " + str(value) + "\n")
        except TypeError:
            # Links without a plain string are stored under the key None;
            # skip them instead of crashing.
            continue
    fw.close()
    print('\nResult saved to file tags_frequency.txt')

start()
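
The crawler draws a simple text progress bar: it prints one '#' per visited link, erases the old percentage with backspace characters ('\b'), and flushes stdout so the update appears immediately. Here is a standalone sketch of the same trick, assuming 50 items of simulated work:

import sys
import time

prev_len = 0
print('[', end='')
for i in range(1, 51):
    time.sleep(0.05)           # simulate some work per item
    for _ in range(prev_len):  # move back over the previous "] (..%)" text
        print('\b', end='')
    print('#', end='')
    percent = '] (' + str(i * 2) + '%) '
    prev_len = len(percent)
    print(percent, end='')
    sys.stdout.flush()
print()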
Output
Enter number of pages to crawl : 7
starting crawling...
crawling page 1: [https://example-website.com/#
#] (8%) https://example-website.com/teams/
#] (10%) https://example-website.com//questions
#] (12%) https://example-website.com/privacy/
#] (14%) https://example-website.com/talent/
#] (16%) https://example-website.com/advertising/
#] (18%) https://example-website.com/labs/
#] (20%) https://example-website.com/
#] (22%) https://example-website.com
#] (26%) https://example-website.com/help
.
.
.
Result saved to file tags_frequency.txt
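
As a side note, the counting and sorting done here with Tag_Rank and operator.itemgetter can also be written with collections.Counter from the standard library. A minimal sketch of that alternative, using a hypothetical list of tags:

from collections import Counter

tag_rank = Counter()
for tag in ['python', 'html', 'python', 'css', 'python']:
    tag_rank[tag] += 1

# most_common() returns (tag, count) pairs sorted by count, highest first.
for tag, count in tag_rank.most_common():
    print(tag, ':', count)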

Note: The above tutorial is created for educational and learning purposes.