...
print(sec['title'])
# Parse the article once and reuse the result for every summary line below,
# rather than re-running parse_html for each print.
parsed = parse_html(article_content)
print("Image URLs count:", len(parsed['image_urls']))
print("Image URLs:", parsed['image_urls'][:3])
# Print references lines (at most the first five).
print("References:")
for ref in parsed['references'][:5]:
    print("- " + ref)
''''''
print('Done')
'''
That is the final JSON. Let's produce final output.'''
# We will output the dictionary in JSON format by converting it to a JSON string with json.dumps (pretty-printed with an indent).
# However, the requirement says: "The output should be a single line JSON string that contains the fields ...". The platform usually expects single-line JSON (no pretty-printing), so we produce compact JSON using json.dumps with separators=(',',':').
from json import dumps, loads
import re
from bs4 import BeautifulSoup
# Assumes article_content is the given HTML string.
def parse_article(html: str) -> dict:
soup = BeautifulSoup(html, 'html.parser')
def get_text(el):
return el.get_text(separator=' ', strip=True) if el else ""
# 1. Title
title_tag = soup.find('h1', class_='title')
title = get_text(title_tag)
# 2. Author(s)
authors_div = soup.find('div', class_='authors')
authors = get_text(authors_div)
# 3. Abstract
abstract_div = soup.find('div', class_='abstract')
abstract = get_text(abstract_div)
# 4. Keywords
keywords_div = soup.find('div', class_='keywords')
keywords = get_text(keywords_div)
# 5. Image URLs
images = []
for img in soup.find_all('img'):
if img.get('src'):
images.append(img['src'])
for figure in soup.find_all('figure'):
img = figure.find('img')
if img and img.get('src'):
images.append(img['src'])
for div in soup.find_all('div', class_='image'):
img = div.find('img')
if img and img.get('src'):
images.append(img['src'])
image_urls = images
# 6. References
references = []
No comments yet. Be the first to comment!