import csv  # Library to handle CSV files
import json  # Library to handle JSON data
import re  # Regular expressions library for pattern matching
import xml.etree.ElementTree as ET  # Library to parse XML data

import openai  # OpenAI API library for interacting with GPT models
import requests  # HTTP client — used throughout but was previously never imported
from requests.auth import HTTPBasicAuth  # For HTTP Basic Authentication
WEBSITE = “https://www.kantspel.se” # Base URL of the website
USERNAME = “Lucas” # Username for authentication
PASSWORD = “pk9Q v587 suNm Uxvd zOsJ PCmp” # Password for authentication
SITEMAP_URL = WEBSITE + “/post-sitemap.xml” # URL to the website’s sitemap
OPENAI_API_KEY = “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” # OpenAI API Key
AUTH = HTTPBasicAuth(USERNAME, PASSWORD) # Authentication object using HTTP Basic Auth
def read_csv_file(file_path):
"""
Reads a CSV file and returns a list of dictionaries, each representing a row.
:param file_path: Path to the CSV file.
:return: List of dictionaries containing CSV data.
"""
with open(file_path, mode='r') as file:
reader = csv.DictReader(file) # Initialize CSV DictReader
return list(reader) # Convert reader to list and return
def get_post_content(post_id):
"""
Fetches the content of a WordPress post by its ID.
:param post_id: ID of the post to fetch.
:return: Rendered HTML content of the post or None if an error occurs.
"""
try:
url = f"{WEBSITE}/wp-json/wp/v2/posts/{post_id}" # Construct API endpoint URL
response = requests.get(url, auth=AUTH) # Make GET request with authentication
response.raise_for_status() # Raise exception for HTTP errors
content = response.json()['content']['rendered'] # Extract rendered content from JSON response
return content # Return the post content
except requests.RequestException as e:
print(f"Error fetching post content: {e}") # Print error message
return None # Return None on failure
def get_post_title(post_id):
"""
Retrieves the title of a WordPress post by its ID.
:param post_id: ID of the post.
:return: Title of the post or None if an error occurs.
"""
try:
response = requests.get(f"{WEBSITE}/wp-json/wp/v2/posts/{post_id}", auth=AUTH) # Make GET request
response.raise_for_status() # Raise exception for HTTP errors
return response.json()['title']['rendered'] # Extract and return the rendered title
except requests.RequestException as e:
print(f"Error fetching post title: {e}") # Print error message
return None # Return None on failure
def get_posts_from_sitemap(current_post_url):
"""
Parses the sitemap to retrieve a list of post URLs, excluding the current post and the first two posts.
:param current_post_url: URL of the current post to exclude.
:return: List of post URLs.
"""
try:
response = requests.get(SITEMAP_URL) # Fetch the sitemap XML
response.raise_for_status() # Raise exception for HTTP errors
sitemap = ET.fromstring(response.content) # Parse XML content
namespaces = {'ns': 'http://www.sitemaps.org/schemas/sitemap/0.9'} # Define XML namespaces
current_post_url = current_post_url.rstrip('/') # Normalize current post URL by removing trailing slash
# Extract all URLs from the sitemap, exclude the current post, and skip the first two entries
return [url.text for url in sitemap.findall('ns:url/ns:loc', namespaces)
if url.text.rstrip('/') != current_post_url][2:]
except requests.RequestException as e:
print(f"Error fetching posts from sitemap: {e}") # Print error message
return [] # Return empty list on failure
def extract_keywords_and_intent(text):
"""
Extracts keywords and intent from a given text using OpenAI's API.
:param text: The text to analyze.
:return: Dictionary containing 'keywords' and 'intent'.
"""
prompt = (
"Extract the main keywords and the intent of the following text. "
"Return the result in JSON format with two fields: 'keywords' (a list of relevant keywords) "
"and 'intent' (a brief description of the main intent).\n\n"
f"Text: {text}"
)
openai.api_key = OPENAI_API_KEY # Ensure this is correctly set
messages = [
{"role": "system", "content": "You are a knowledgeable assistant skilled in text analysis."},
{"role": "user", "content": prompt},
]
try:
response = openai.ChatCompletion.create(
model="gpt-4o-mini", # Use the appropriate model
messages=messages,
max_tokens=150,
temperature=0.3
)
response_content = response.choices.message['content'].strip()
# Ensure the response starts with a JSON object
if not response_content.startswith('{'):
first_brace = response_content.find('{')
if first_brace != -1:
response_content = response_content[first_brace:]
else:
raise ValueError("No JSON object found in the response.")
data = json.loads(response_content)
return data
except json.JSONDecodeError as e:
print(f"JSON decoding error in extract_keywords_and_intent: {e}. Response Content: {response_content}")
return {"keywords": [], "intent": ""}
except Exception as e:
print(f"Error extracting keywords and intent: {e}")
return {"keywords": [], "intent": ""}
def ask_openai(prompt):
"""
Sends a prompt to OpenAI's ChatCompletion API and retrieves the response.
:param prompt: The prompt string to send to OpenAI.
:return: The content of OpenAI's response or None if an error occurs.
"""
openai.api_key = OPENAI_API_KEY # Set the OpenAI API key
messages = [
{"role": "system", "content": "You are a helpful assistant."}, # System message defining assistant behavior
{"role": "user", "content": prompt}, # User's prompt
]
try:
response = openai.ChatCompletion.create(
model="gpt-4o-mini", # Specify the model to use
messages=messages, # Pass the messages list
max_tokens=4096, # Maximum number of tokens in the response
temperature=0.3 # Controls the randomness of the response
)
return response.choices.message['content'] # Return the content of the first choice
except Exception as e:
print(f"Error in OpenAI request: {e}") # Print error message
return None # Return None on failure
def lister(post_id, post_url):
"""
Main processing function that gathers post data, extracts keywords and intent,
interacts with OpenAI to get article ratings, and returns a list of URLs with high relevance ratings.
:param post_id: ID of the post being processed.
:param post_url: URL of the post being processed.
:return: Tuple containing a list of relevant URLs and the post content.
"""
print(f"Processing in lister - Post ID: {post_id}, Post URL: {post_url}") # Debugging information
post_title = get_post_title(post_id) # Retrieve the post title
post_content = get_post_content(post_id) # Retrieve the post content
if not post_content:
print(f"No content found for Post ID {post_id}. Skipping.")
return [], None
print(f"Post Content for ID {post_id}: {post_content[:100]}...") # Print first 100 characters of content for debugging
post_urls = get_posts_from_sitemap(post_url) # Get list of related post URLs from sitemap
print(f"URLs from Sitemap for Post ID {post_id}: {post_urls}") # Debugging information
with open('internal_link_finder.txt', 'r', encoding='utf-8') as file:
custom_prompt_template = file.read() # Read the custom prompt template from a file
# Extract keywords and intent from the current post
analysis = extract_keywords_and_intent(post_content)
post_keywords = ', '.join(analysis.get('keywords', []))
post_intent = analysis.get('intent', '')
# Replace placeholders in the prompt template with actual post data
custom_prompt = custom_prompt_template.replace("[POST_TITLE]", post_title)
custom_prompt = custom_prompt.replace("[POST_CONTENT]", post_content)
custom_prompt = custom_prompt.replace("[POST_KEYWORDS]", post_keywords)
custom_prompt = custom_prompt.replace("[POST_INTENT]", post_intent)
custom_prompt = custom_prompt.replace("[POST_URLS]", ', '.join(post_urls))
# Append instructions to enforce JSON response format
custom_prompt += (
"\n\nPlease provide the response in strictly valid JSON format as follows:\n"
"{\n"
" \"article_ratings\": [\n"
" {\"url\": \"URL1\", \"relevance_rating\": 8, \"keywords\": [\"keyword1\", \"keyword2\"], \"intent\": \"intent1\"},\n"
" {\"url\": \"URL2\", \"relevance_rating\": 9, \"keywords\": [\"keyword3\", \"keyword4\"], \"intent\": \"intent2\"},\n"
" {\"url\": \"URL3\", \"relevance_rating\": 7, \"keywords\": [\"keyword5\", \"keyword6\"], \"intent\": \"intent3\"}\n"
" ]\n"
"}"
)
openai_response = ask_openai(custom_prompt) # Send the prompt to OpenAI and get the response
if not openai_response:
print(f"OpenAI did not return a valid response for Post ID {post_id}.")
return [], post_content # Return empty list and post content
# Use regex to extract JSON content from the OpenAI response
json_pattern = re.compile(r'\{.*\}', re.DOTALL) # Pattern to match JSON objects
match = json_pattern.search(openai_response) # Search for JSON in the response
if not match:
print(f"No JSON found in OpenAI response for Post ID {post_id}. Response: {openai_response}")
return [], post_content # Return empty list and post content
json_string = match.group() # Extract the matched JSON string
try:
response_json = json.loads(json_string) # Parse the JSON string into a Python dictionary
except json.JSONDecodeError as e:
print(f"JSON decoding error for Post ID {post_id}: {e}. JSON string: {json_string}")
return [], post_content # Return empty list and post content
article_ratings = response_json.get('article_ratings', []) # Extract the 'article_ratings' list from JSON
# Initialize a list to store URLs with high relevance ratings
url_list = []
for rating_entry in article_ratings:
url = rating_entry.get('url') # Get the URL from the rating entry
rating = rating_entry.get('relevance_rating', 0) # Get the relevance rating, default to 0 if missing
if url and rating >= 8: # Check if URL exists and rating is 8 or higher
url_list.append(url) # Add to the list
# Debugging information
print(f"Custom Prompt: {custom_prompt}")
print(f"OpenAI Response: {openai_response}")
print(f"Final URL List: {url_list}")
return url_list, post_content # Return the list of relevant URLs and the post content