Thực hành vòng lặp lồng nhau trong Python web crawler (2.1)

Viết một hàm nhận ba tham số (book_name: tên sách, chapter_title: tiêu đề chương, chapter_content: nội dung chương). Hàm này sẽ tạo thư mục book_name nếu chưa tồn tại, sau đó tạo file <chapter_title>.txt trong thư mục đó và ghi nội dung chương vào file.

import os

def write_content_to_file(book_name, chapter_title, chapter_content):
    """Write one chapter to ``<book_name>/<chapter_title>.txt``.

    Args:
        book_name: Directory for the book; created (with any missing
            parents) when it does not exist yet.
        chapter_title: File name stem; ``.txt`` is appended.
        chapter_content: Text written as UTF-8. An existing file with the
            same name is overwritten.

    Raises:
        OSError: If the directory or the file cannot be created or written.
    """
    # exist_ok=True replaces the isdir()-then-makedirs() pair, which could
    # fail if the directory appeared between the check and the creation.
    os.makedirs(book_name, exist_ok=True)

    file_full_path = os.path.join(book_name, f"{chapter_title}.txt")

    with open(file_full_path, 'w', encoding='utf-8') as f:
        f.write(chapter_content)

# Example usage: writes SachHay/Chuong1.txt relative to the working directory.
folder_name, chapter_name = "SachHay", "Chuong1"
chapter_data = "Đây là nội dung của chương 1."
write_content_to_file(
    book_name=folder_name,
    chapter_title=chapter_name,
    chapter_content=chapter_data,
)

Từ một trang web sách, viết các hàm để lấy (tất cả sách, tất cả chương, tất cả nội dung). Trả về dưới dạng dictionary và ghi vào file output.json cục bộ.

import random
import time
import json

def fetch_page(url):
    """Fetch ``url`` and return the parsed HTML tree (placeholder).

    The HTTP request and the HTML parsing are not written yet.

    Args:
        url: Address of the page to download.

    Raises:
        NotImplementedError: Always, until the network and parsing logic
            is filled in.
    """
    # Fail explicitly instead of returning an undefined name, so callers
    # see exactly which piece is still missing.
    raise NotImplementedError(
        "fetch_page() is a placeholder: implement the HTTP request and HTML parsing"
    )

def extract_books(tree):
    """Extract the book list from ``tree`` as ``{book_title: book_url}`` (placeholder).

    The extraction logic is not written yet.

    Args:
        tree: Parsed page object, as produced by the page-fetching step.

    Raises:
        NotImplementedError: Always, until the extraction logic is filled in.
    """
    # Fail explicitly instead of returning undefined names, so callers
    # see exactly which piece is still missing.
    raise NotImplementedError(
        "extract_books() is a placeholder: implement the book-list extraction"
    )

def get_chapters(book_url):
    """Return the chapters of a book as ``{chapter_title: chapter_url}`` (placeholder).

    The extraction logic is not written yet.

    Args:
        book_url: Address of the book page to read the chapter list from.

    Raises:
        NotImplementedError: Always, until the extraction logic is filled in.
    """
    # Fail explicitly instead of returning undefined names, so callers
    # see exactly which piece is still missing.
    raise NotImplementedError(
        "get_chapters() is a placeholder: implement the chapter-list extraction"
    )

def get_chapter_content(title, chapter_url):
    """Return one chapter's text as ``{title: text}`` (placeholder).

    The extraction logic is not written yet.

    Args:
        title: Chapter title, used as the key of the returned dictionary.
        chapter_url: Address of the chapter page to read the text from.

    Raises:
        NotImplementedError: Always, until the extraction logic is filled in.
    """
    # Fail explicitly instead of returning an undefined name, so callers
    # see exactly which piece is still missing.
    raise NotImplementedError(
        "get_chapter_content() is a placeholder: implement the chapter-text extraction"
    )

def run_crawler(start_url='https://www.example.com/novels', output_path='output.json'):
    """Crawl every book and chapter reachable from ``start_url`` and save them as JSON.

    The collected data has the shape ``{book_name: {chapter_name: text}}``.

    Args:
        start_url: Page listing the books to crawl.
        output_path: File the collected dictionary is written to as
            UTF-8 JSON (non-ASCII characters are kept as-is).

    Returns:
        The collected dictionary, which is also written to ``output_path``.
    """
    all_data = {}

    for book_name, book_url in extract_books(fetch_page(start_url)).items():
        print(f"Đang xử lý sách: {book_name}")
        book_content = {}
        # get_chapters() and get_chapter_content() are declared to take
        # URLs, so hand them the URL rather than an already-fetched page.
        for chapter_name, chapter_url in get_chapters(book_url).items():
            book_content.update(get_chapter_content(chapter_name, chapter_url))
            print(f"  Đã tải: {chapter_name}")
            # Random pause between chapters to avoid being blocked.
            time.sleep(random.uniform(1, 3))

        all_data[book_name] = book_content

    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(all_data, f, ensure_ascii=False, indent=2)

    return all_data


# Start the crawl only when this file is executed as a script.
if __name__ == '__main__':
    run_crawler()

Giả sử có một dictionary biểu diễn cấu trúc thư mục (sách → chương → nội dung) như sau:

# Sample directory structure: book name -> {chapter name -> chapter text}.
catalog = {
    'SachA': {
        'Chuong1': 'Noi dung 1',
        'Chuong2': 'Noi dung 2',
    },
    'SachB': {
        'Chuong1': 'Noi dung 1',
        'Chuong2': 'Noi dung 2',
    },
}

Để đọc dữ liệu từ file JSON và chuyển đổi thành dictionary, sử dụng json.load().

import json

# Read output.json (text mode is the default) and parse it into a dictionary.
with open('output.json', encoding='utf-8') as json_file:
    loaded_data = json.load(json_file)

# Show the dictionary that was loaded.
print(loaded_data)

Để ghi dictionary cấu trúc thư mục vào các file SachX/ChuongX.txt tương ứng (đường dẫn tương đối so với thư mục hiện tại):

import os

# Sample input: book name -> {chapter name -> chapter text}.
data = {
    'SachA': {
        'Chuong1': 'Noi dung 1',
        'Chuong2': 'Noi dung 2',
    },
    'SachB': {
        'Chuong1': 'Noi dung 1',
        'Chuong2': 'Noi dung 2',
    },
}

def save_dictionary_to_files(dict_data):
    """Write a ``{book: {chapter: content}}`` mapping to ``<book>/<chapter>.txt`` files.

    Each file holds the chapter name on its first line, followed by the
    chapter content, encoded as UTF-8. Existing files are overwritten.
    Progress messages are printed for every book and chapter.

    Args:
        dict_data: Mapping of book name (used as the directory path) to a
            mapping of chapter name to chapter text.

    Raises:
        OSError: If a directory or file cannot be created or written.
    """
    for book, chapters in dict_data.items():
        # exist_ok=True replaces the exists()-then-makedirs() pair, which
        # could fail if the directory appeared between check and creation.
        os.makedirs(book, exist_ok=True)
        print(f'Đang tạo thư mục: {book}')

        for chapter, content in chapters.items():
            file_path = os.path.join(book, f"{chapter}.txt")
            with open(file_path, 'w', encoding='utf-8') as f:
                f.write(f"{chapter}\n")
                f.write(content)
            print(f'  Đã ghi file: {chapter}')

# Write the sample dictionary out as one directory per book.
save_dictionary_to_files(dict_data=data)

Thẻ: python Web Crawler JSON File I/O os module

Đăng vào ngày 18 tháng 6 lúc 21:41

Thành phố Cuồng loạn

Thực hành vòng lặp lồng nhau trong Python web crawler (2.1)

Thẻ Phổ Biến