Save workspace changes
This commit is contained in:
241
app/Services/AiBiography/AiBiographyValidator.php
Normal file
241
app/Services/AiBiography/AiBiographyValidator.php
Normal file
@@ -0,0 +1,241 @@
|
||||
<?php
|
||||
|
||||
declare(strict_types=1);
|
||||
|
||||
namespace App\Services\AiBiography;
|
||||
|
||||
/**
 * Validates generated biography text before it is stored.
 *
 * v1.1 additions:
 *   – Extended forbidden phrases (renowned, celebrated, iconic, etc.)
 *   – Generic filler detection ("creator journey", "over the years" repeated)
 *   – Repetitive phrase detection
 *   – Sparse-profile mismatch check (rich-sounding bio for sparse creator)
 *
 * NOTE(review): "stat-dump detection" (too many bare numbers in a short text)
 * was listed as a v1.1 addition, but no such check exists in this class.
 *
 * Rejects output that is:
 *   – empty or too short to be useful
 *   – too long (hard cap)
 *   – not a single paragraph (blank line separating blocks)
 *   – contains markdown (headings, bullets, bold, italic, code)
 *   – contains forbidden hype terms
 *   – contains placeholder or apology patterns
 *   – sounds too rich/boastful for a sparse creator profile
 */
final class AiBiographyValidator
{
    private const MIN_WORDS = 20;
    private const MAX_WORDS = 180;

    /**
     * Phrases that are always forbidden, regardless of tier.
     * These indicate hallucinated praise, AI-apology patterns, or unsupported claims.
     *
     * All entries must be lower-case and use the ASCII apostrophe: the text is
     * lower-cased and its typographic apostrophes are normalised before matching.
     */
    private const FORBIDDEN_PHRASES = [
        // Unsupported significance claims
        'world-class',
        'world class',
        'iconic visionary',
        'unmatched style',
        'legendary',
        'changed the platform',
        'beloved by everyone',
        'renowned for',
        'masterpiece creator',
        'masterclass',
        'celebrated artist',
        'celebrated creator',
        'celebrated by',
        'iconic creator',
        'iconic artist',
        'iconic work',
        'platform legend',
        'community favorite',
        'widely recognized',
        'highly regarded',
        'critically acclaimed',
        // AI apology / refusal patterns
        'i cannot',
        "i can't",
        'i apologize',
        'as an ai',
        'as a language model',
        'i do not have',
        "i don't have",
        'based on the information provided',
        'unfortunately',
        "i'm unable to",
        'i am unable to',
        // Vague over-praising filler
        'truly remarkable',
        'absolutely exceptional',
        'without a doubt',
        'undeniably talented',
    ];

    /**
     * Phrases that signal generic, formulaic filler when used more than once.
     * A single occurrence is allowed; repeated use is rejected.
     */
    private const REPETITION_PHRASES = [
        'creator journey',
        'over the years',
        'has been part of skinbase',
        'has been a member',
        'throughout the years',
        'through the years',
        'journey on skinbase',
    ];

    /**
     * Validate the generated biography.
     *
     * Checks run in a fixed order (length, markdown, paragraphs, forbidden
     * phrases, repetition, sparse mismatch) and every failing check appends one
     * message, except that an empty text short-circuits with a single error.
     *
     * @param  string  $text         the generated biography text
     * @param  string  $qualityTier  'rich'|'medium'|'sparse' — only 'sparse' enables the mismatch check
     * @return list<string>          validation errors; empty list means valid
     */
    public function validate(string $text, string $qualityTier = 'rich'): array
    {
        $trimmed = trim($text);

        if ($trimmed === '') {
            return ['Biography is empty.'];
        }

        $errors = [];
        $wordCount = $this->countWords($trimmed);

        if ($wordCount < self::MIN_WORDS) {
            $errors[] = sprintf('Biography is too short (%d words, minimum %d).', $wordCount, self::MIN_WORDS);
        }

        if ($wordCount > self::MAX_WORDS) {
            $errors[] = sprintf('Biography is too long (%d words, maximum %d).', $wordCount, self::MAX_WORDS);
        }

        if ($this->containsMarkdown($trimmed)) {
            $errors[] = 'Biography contains markdown or structural formatting.';
        }

        if ($this->hasMultipleParagraphs($trimmed)) {
            $errors[] = 'Biography contains multiple paragraphs; must be a single paragraph.';
        }

        // Lower-case once (not per phrase) and fold typographic apostrophes so
        // that "I can’t" is caught by the "i can't" entry.
        $normalized = $this->normalizeForMatching($trimmed);

        foreach (self::FORBIDDEN_PHRASES as $phrase) {
            if ($this->containsWholePhrase($normalized, $phrase)) {
                $errors[] = "Biography contains forbidden phrase: \"{$phrase}\".";
                // Only the first offending phrase is reported.
                break;
            }
        }

        $repetitionError = $this->checkRepetition($trimmed);
        if ($repetitionError !== null) {
            $errors[] = $repetitionError;
        }

        if ($qualityTier === 'sparse' && $this->soundsTooRichForSparseProfile($trimmed)) {
            $errors[] = 'Biography sounds too claim-heavy for a sparse creator profile.';
        }

        return $errors;
    }

    /**
     * Convenience wrapper: true when validate() reports no errors.
     */
    public function isValid(string $text, string $qualityTier = 'rich'): bool
    {
        return $this->validate($text, $qualityTier) === [];
    }

    // -------------------------------------------------------------------------

    /**
     * Count whitespace-delimited tokens that contain at least one letter or digit.
     *
     * str_word_count() is byte/locale based: it ignores digits and splits or
     * drops non-ASCII words, so it under-counts (down to zero) for non-Latin
     * text. It is kept only as a fallback for input PCRE cannot process
     * (e.g. invalid UTF-8).
     */
    private function countWords(string $text): int
    {
        $count = preg_match_all('/\S*[\p{L}\p{N}]\S*/u', $text);

        return $count === false ? str_word_count($text) : $count;
    }

    /**
     * Lower-case the text and replace typographic apostrophes with the ASCII one.
     */
    private function normalizeForMatching(string $text): string
    {
        $asciiApostrophes = str_replace(["\u{2018}", "\u{2019}", "\u{02BC}"], "'", $text);

        return mb_strtolower($asciiApostrophes, 'UTF-8');
    }

    /**
     * Whether $phrase occurs in $lowerText as a whole word/phrase.
     *
     * The match must not start or end inside a longer word, so "as an ai" does
     * not hit "as an airbrush artist". A trailing plural "s"/"es" is tolerated
     * so "masterclasses" still matches "masterclass".
     *
     * @param string $lowerText already lower-cased haystack
     * @param string $phrase    lower-case needle
     */
    private function containsWholePhrase(string $lowerText, string $phrase): bool
    {
        $pattern = '/(?<![\p{L}\p{N}])' . preg_quote($phrase, '/') . '(?:e?s)?(?![\p{L}\p{N}])/u';
        $result = preg_match($pattern, $lowerText);

        if ($result === false) {
            // PCRE could not process the text (e.g. invalid UTF-8): fall back
            // to the plain substring test rather than letting the phrase through.
            return str_contains($lowerText, $phrase);
        }

        return $result === 1;
    }

    /**
     * Detect markdown or other structural formatting.
     */
    private function containsMarkdown(string $text): bool
    {
        $lineStartPatterns = [
            '/^\s*#{1,6}\s/m',  // headings: #, ##, ...
            '/^\s*[-*]\s/m',    // bullets: lines starting with - or *
            '/^\s*\d+\.\s/m',   // numbered list items
        ];

        foreach ($lineStartPatterns as $pattern) {
            if (preg_match($pattern, $text) === 1) {
                return true;
            }
        }

        // Doubled emphasis markers and backticks (inline code or fences) do not
        // belong in plain prose, so any occurrence is rejected.
        if (str_contains($text, '**') || str_contains($text, '__') || str_contains($text, '`')) {
            return true;
        }

        // Single-asterisk emphasis: *text* with no whitespace just inside the
        // markers. A lone asterisk (footnote mark, "5* rating") is not markdown.
        if (preg_match('/\*(?=\S)[^*]+(?<=\S)\*/', $text) === 1) {
            return true;
        }

        // Single-underscore emphasis: _text_ delimited at word boundaries.
        // Underscores inside a word (snake_case usernames such as dark_knight)
        // are not emphasis and must not be rejected.
        // If PCRE cannot process the text (invalid UTF-8) this check is skipped.
        return preg_match('/(?<![\p{L}\p{N}])_(?=\S)[^_]+(?<=\S)_(?![\p{L}\p{N}])/u', $text) === 1;
    }

    /**
     * A blank (or whitespace-only) line between two newlines is a paragraph break.
     */
    private function hasMultipleParagraphs(string $text): bool
    {
        return preg_match('/\n\s*\n/', $text) === 1;
    }

    /**
     * Check whether any formulaic phrase appears more than once,
     * which usually indicates a recycled or low-quality output.
     *
     * @return string|null error message for the first repeated phrase, or null if none
     */
    private function checkRepetition(string $text): ?string
    {
        $lower = mb_strtolower($text, 'UTF-8');

        foreach (self::REPETITION_PHRASES as $phrase) {
            // substr_count() counts non-overlapping occurrences.
            if (substr_count($lower, $phrase) >= 2) {
                return "Biography repeats the phrase \"{$phrase}\" too many times.";
            }
        }

        return null;
    }

    /**
     * For sparse-profile biographies, reject text that sounds too achievement-heavy.
     * These signals typically appear only in rich profiles and would be hallucinated
     * or misleading when the creator has very little public history.
     *
     * Each indicator counts at most once, however often it occurs.
     */
    private function soundsTooRichForSparseProfile(string $text): bool
    {
        $lower = mb_strtolower($text, 'UTF-8');

        // Matched as substrings so inflected forms ("remastered") still count.
        $substringIndicators = [
            'featured',
            'best-performing',
            'standout',
            'milestone',
            'comeback',
            'evolution',
            'remaster',
            'streak',
            'downloads',
            'most productive',
        ];

        $hitCount = 0;
        foreach ($substringIndicators as $indicator) {
            if (str_contains($lower, $indicator)) {
                $hitCount++;
            }
        }

        // "era" must be a whole word: as a substring it also matches "several",
        // "camera", "general", "operating", ... and inflates the score.
        if ($this->containsWholePhrase($lower, 'era')) {
            $hitCount++;
        }

        // If a sparse profile biography references 3+ rich signals, it likely hallucinated them.
        return $hitCount >= 3;
    }
}
|
||||
Reference in New Issue
Block a user