xref: /haiku/src/add-ons/kernel/network/protocols/tcp/TCPEndpoint.cpp (revision 7bdeef54a24d3417300f251af891df962b638b9b)
1 /*
2  * Copyright 2006-2010, Haiku, Inc. All Rights Reserved.
3  * Distributed under the terms of the MIT License.
4  *
5  * Authors:
6  *		Andrew Galante, haiku.galante@gmail.com
7  *		Axel Dörfler, axeld@pinc-software.de
8  *		Hugo Santos, hugosantos@gmail.com
9  */
10 
11 
12 #include "TCPEndpoint.h"
13 
14 #include <netinet/in.h>
15 #include <netinet/ip.h>
16 #include <netinet/tcp.h>
17 #include <new>
18 #include <signal.h>
19 #include <stdlib.h>
20 #include <string.h>
21 
22 #include <KernelExport.h>
23 #include <Select.h>
24 
25 #include <net_buffer.h>
26 #include <net_datalink.h>
27 #include <net_stat.h>
28 #include <NetBufferUtilities.h>
29 #include <NetUtilities.h>
30 
31 #include <lock.h>
32 #include <tracing.h>
33 #include <util/AutoLock.h>
34 #include <util/list.h>
35 
36 #include "EndpointManager.h"
37 
38 
39 // References:
40 //	- RFC 793 - Transmission Control Protocol
41 //	- RFC 813 - Window and Acknowledgement Strategy in TCP
42 //	- RFC 1337 - TIME_WAIT Assassination Hazards in TCP
43 //
44 // Things this implementation currently doesn't implement:
45 //	- TCP Slow Start, Congestion Avoidance, Fast Retransmit, and Fast Recovery,
46 //	  RFC 2001, RFC 2581, RFC 3042
47 //	- NewReno Modification to TCP's Fast Recovery, RFC 2582
48 //	- Explicit Congestion Notification (ECN), RFC 3168
49 //	- SYN-Cache
50 //	- SACK, Selective Acknowledgment - RFC 2018, RFC 2883, RFC 3517
51 //	- Forward RTO-Recovery, RFC 4138
52 //	- Time-Wait hash instead of keeping sockets alive
53 //
54 // Things incomplete in this implementation:
55 //	- TCP Extensions for High Performance, RFC 1323 - RTTM, PAWS
56 
// Expands to a printable C string for the given socket address. Relies on
// Domain() being callable in the context the macro is used in.
#define PrintAddress(sockAddress) \
	AddressString(Domain(), sockAddress, true).Data()

// Uncomment to get TRACE() and/or PROBE() output via dprintf():
//#define TRACE_TCP
//#define PROBE_TCP

#ifdef TRACE_TCP
// NOTE: the space in front of ', ##arguments' must stay, cpp 2.95 needs it
#	define TRACE(fmt, arguments...)	dprintf("%" B_PRId32 ": TCP [%" \
		B_PRIdBIGTIME "] %p (%12s) " fmt "\n", find_thread(NULL), \
		system_time(), this, name_for_state(fState) , ##arguments)
#else
#	define TRACE(arguments...)			do { } while (0)
#endif

#ifdef PROBE_TCP
// Dumps the send side state of the endpoint together with the addresses and
// size of the given buffer.
#	define PROBE(probedBuffer, sendWindow) \
	dprintf("TCP PROBE %" B_PRIdBIGTIME " %s %s %" B_PRIu32 " snxt %" B_PRIu32 \
		" suna %" B_PRIu32 " cw %" B_PRIu32 " sst %" B_PRIu32 " win %" \
		B_PRIu32 " swin %" B_PRIu32 " smax-suna %" B_PRIu32 " savail %" \
		B_PRIuSIZE " sqused %" B_PRIuSIZE " rto %" B_PRIdBIGTIME "\n", \
		system_time(), PrintAddress(probedBuffer->source), \
		PrintAddress(probedBuffer->destination), probedBuffer->size, \
		fSendNext.Number(), fSendUnacknowledged.Number(), fCongestionWindow, \
		fSlowStartThreshold, sendWindow, fSendWindow, \
		(fSendMax - fSendUnacknowledged).Number(), \
		fSendQueue.Available(fSendNext), fSendQueue.Used(), fRetransmitTimeout)
#else
#	define PROBE(probedBuffer, sendWindow)	do { } while (0)
#endif
86 
87 #if TCP_TRACING
88 namespace TCPTracing {
89 
90 class Receive : public AbstractTraceEntry {
91 public:
92 	Receive(TCPEndpoint* endpoint, tcp_segment_header& segment, uint32 window,
93 			net_buffer* buffer)
94 		:
95 		fEndpoint(endpoint),
96 		fBuffer(buffer),
97 		fBufferSize(buffer->size),
98 		fSequence(segment.sequence),
99 		fAcknowledge(segment.acknowledge),
100 		fWindow(window),
101 		fState(endpoint->State()),
102 		fFlags(segment.flags)
103 	{
104 		Initialized();
105 	}
106 
107 	virtual void AddDump(TraceOutput& out)
108 	{
109 		out.Print("tcp:%p (%12s) receive buffer %p (%" B_PRIu32 " bytes), "
110 			"flags %#" B_PRIx8 ", seq %" B_PRIu32 ", ack %" B_PRIu32
111 			", wnd %" B_PRIu32, fEndpoint, name_for_state(fState), fBuffer,
112 			fBufferSize, fFlags, fSequence, fAcknowledge, fWindow);
113 	}
114 
115 protected:
116 	TCPEndpoint*	fEndpoint;
117 	net_buffer*		fBuffer;
118 	uint32			fBufferSize;
119 	uint32			fSequence;
120 	uint32			fAcknowledge;
121 	uint32			fWindow;
122 	tcp_state		fState;
123 	uint8			fFlags;
124 };
125 
126 class Send : public AbstractTraceEntry {
127 public:
128 	Send(TCPEndpoint* endpoint, tcp_segment_header& segment, net_buffer* buffer,
129 			tcp_sequence firstSequence, tcp_sequence lastSequence)
130 		:
131 		fEndpoint(endpoint),
132 		fBuffer(buffer),
133 		fBufferSize(buffer->size),
134 		fSequence(segment.sequence),
135 		fAcknowledge(segment.acknowledge),
136 		fFirstSequence(firstSequence.Number()),
137 		fLastSequence(lastSequence.Number()),
138 		fState(endpoint->State()),
139 		fFlags(segment.flags)
140 	{
141 		Initialized();
142 	}
143 
144 	virtual void AddDump(TraceOutput& out)
145 	{
146 		out.Print("tcp:%p (%12s) send buffer %p (%" B_PRIu32 " bytes), "
147 			"flags %#" B_PRIx8 ", seq %" B_PRIu32 ", ack %" B_PRIu32
148 			", first %" B_PRIu32 ", last %" B_PRIu32, fEndpoint,
149 			name_for_state(fState), fBuffer, fBufferSize, fFlags, fSequence,
150 			fAcknowledge, fFirstSequence, fLastSequence);
151 	}
152 
153 protected:
154 	TCPEndpoint*	fEndpoint;
155 	net_buffer*		fBuffer;
156 	uint32			fBufferSize;
157 	uint32			fSequence;
158 	uint32			fAcknowledge;
159 	uint32			fFirstSequence;
160 	uint32			fLastSequence;
161 	tcp_state		fState;
162 	uint8			fFlags;
163 };
164 
165 class State : public AbstractTraceEntry {
166 public:
167 	State(TCPEndpoint* endpoint)
168 		:
169 		fEndpoint(endpoint),
170 		fState(endpoint->State())
171 	{
172 		Initialized();
173 	}
174 
175 	virtual void AddDump(TraceOutput& out)
176 	{
177 		out.Print("tcp:%p (%12s) state change", fEndpoint,
178 			name_for_state(fState));
179 	}
180 
181 protected:
182 	TCPEndpoint*	fEndpoint;
183 	tcp_state		fState;
184 };
185 
186 class Spawn : public AbstractTraceEntry {
187 public:
188 	Spawn(TCPEndpoint* listeningEndpoint, TCPEndpoint* spawnedEndpoint)
189 		:
190 		fListeningEndpoint(listeningEndpoint),
191 		fSpawnedEndpoint(spawnedEndpoint)
192 	{
193 		Initialized();
194 	}
195 
196 	virtual void AddDump(TraceOutput& out)
197 	{
198 		out.Print("tcp:%p spawns %p", fListeningEndpoint, fSpawnedEndpoint);
199 	}
200 
201 protected:
202 	TCPEndpoint*	fListeningEndpoint;
203 	TCPEndpoint*	fSpawnedEndpoint;
204 };
205 
206 class Error : public AbstractTraceEntry {
207 public:
208 	Error(TCPEndpoint* endpoint, const char* error, int32 line)
209 		:
210 		fEndpoint(endpoint),
211 		fLine(line),
212 		fError(error),
213 		fState(endpoint->State())
214 	{
215 		Initialized();
216 	}
217 
218 	virtual void AddDump(TraceOutput& out)
219 	{
220 		out.Print("tcp:%p (%12s) error at line %" B_PRId32 ": %s", fEndpoint,
221 			name_for_state(fState), fLine, fError);
222 	}
223 
224 protected:
225 	TCPEndpoint*	fEndpoint;
226 	int32			fLine;
227 	const char*		fError;
228 	tcp_state		fState;
229 };
230 
231 class TimerSet : public AbstractTraceEntry {
232 public:
233 	TimerSet(TCPEndpoint* endpoint, const char* which, bigtime_t timeout)
234 		:
235 		fEndpoint(endpoint),
236 		fWhich(which),
237 		fTimeout(timeout),
238 		fState(endpoint->State())
239 	{
240 		Initialized();
241 	}
242 
243 	virtual void AddDump(TraceOutput& out)
244 	{
245 		out.Print("tcp:%p (%12s) %s timer set to %" B_PRIdBIGTIME, fEndpoint,
246 			name_for_state(fState), fWhich, fTimeout);
247 	}
248 
249 protected:
250 	TCPEndpoint*	fEndpoint;
251 	const char*		fWhich;
252 	bigtime_t		fTimeout;
253 	tcp_state		fState;
254 };
255 
256 class TimerTriggered : public AbstractTraceEntry {
257 public:
258 	TimerTriggered(TCPEndpoint* endpoint, const char* which)
259 		:
260 		fEndpoint(endpoint),
261 		fWhich(which),
262 		fState(endpoint->State())
263 	{
264 		Initialized();
265 	}
266 
267 	virtual void AddDump(TraceOutput& out)
268 	{
269 		out.Print("tcp:%p (%12s) %s timer triggered", fEndpoint,
270 			name_for_state(fState), fWhich);
271 	}
272 
273 protected:
274 	TCPEndpoint*	fEndpoint;
275 	const char*		fWhich;
276 	tcp_state		fState;
277 };
278 
279 class APICall : public AbstractTraceEntry {
280 public:
281 	APICall(TCPEndpoint* endpoint, const char* which)
282 		:
283 		fEndpoint(endpoint),
284 		fWhich(which),
285 		fState(endpoint->State())
286 	{
287 		Initialized();
288 	}
289 
290 	virtual void AddDump(TraceOutput& out)
291 	{
292 		out.Print("tcp:%p (%12s) api call: %s", fEndpoint,
293 			name_for_state(fState), fWhich);
294 	}
295 
296 protected:
297 	TCPEndpoint*	fEndpoint;
298 	const char*		fWhich;
299 	tcp_state		fState;
300 };
301 
302 }	// namespace TCPTracing
303 
304 #	define T(x)	new(std::nothrow) TCPTracing::x
305 #else
306 #	define T(x)
307 #endif	// TCP_TRACING
308 
309 
// Bit values that are combined in the fFlags field
enum {
	FLAG_OPTION_WINDOW_SCALE	= 1 << 0,
	FLAG_OPTION_TIMESTAMP		= 1 << 1,
	// TODO: Should FLAG_NO_RECEIVE apply as well to received connections?
	//       That is, what is expected from accept() after a shutdown()
	//       is performed on a listen()ing socket.
	FLAG_NO_RECEIVE				= 1 << 2,
	FLAG_CLOSED					= 1 << 3,
	FLAG_DELETE_ON_CLOSE		= 1 << 4,
	FLAG_LOCAL					= 1 << 5
};
322 
323 
// Divisor that converts the usec based system time into the msec based
// TCP time.
static const int kTimestampFactor = 1000;
326 
327 
328 static inline bigtime_t
329 absolute_timeout(bigtime_t timeout)
330 {
331 	if (timeout == 0 || timeout == B_INFINITE_TIMEOUT)
332 		return timeout;
333 
334 	return timeout + system_time();
335 }
336 
337 
338 static inline status_t
339 posix_error(status_t error)
340 {
341 	if (error == B_TIMED_OUT)
342 		return B_WOULD_BLOCK;
343 
344 	return error;
345 }
346 
347 
348 static inline bool
349 in_window(const tcp_sequence& sequence, const tcp_sequence& receiveNext,
350 	uint32 receiveWindow)
351 {
352 	return sequence >= receiveNext && sequence < (receiveNext + receiveWindow);
353 }
354 
355 
356 static inline bool
357 segment_in_sequence(const tcp_segment_header& segment, int size,
358 	const tcp_sequence& receiveNext, uint32 receiveWindow)
359 {
360 	tcp_sequence sequence(segment.sequence);
361 	if (size == 0) {
362 		if (receiveWindow == 0)
363 			return sequence == receiveNext;
364 		return in_window(sequence, receiveNext, receiveWindow);
365 	} else {
366 		if (receiveWindow == 0)
367 			return false;
368 		return in_window(sequence, receiveNext, receiveWindow)
369 			|| in_window(sequence + size - 1, receiveNext, receiveWindow);
370 	}
371 }
372 
373 
374 static inline bool
375 is_writable(tcp_state state)
376 {
377 	return state == ESTABLISHED || state == FINISH_RECEIVED;
378 }
379 
380 
381 static inline bool
382 is_establishing(tcp_state state)
383 {
384 	return state == SYNCHRONIZE_SENT || state == SYNCHRONIZE_RECEIVED;
385 }
386 
387 
388 static inline uint32 tcp_now()
389 {
390 	return system_time() / kTimestampFactor;
391 }
392 
393 
394 static inline uint32 tcp_diff_timestamp(uint32 base)
395 {
396 	uint32 now = tcp_now();
397 
398 	if (now > base)
399 		return now - base;
400 
401 	return now + UINT_MAX - base;
402 }
403 
404 
405 static inline bool
406 state_needs_finish(int32 state)
407 {
408 	return state == WAIT_FOR_FINISH_ACKNOWLEDGE
409 		|| state == FINISH_SENT || state == CLOSING;
410 }
411 
412 
413 //	#pragma mark -
414 
415 
416 TCPEndpoint::TCPEndpoint(net_socket* socket)
417 	:
418 	ProtocolSocket(socket),
419 	fManager(NULL),
420 	fOptions(0),
421 	fSendWindowShift(0),
422 	fReceiveWindowShift(0),
423 	fSendUnacknowledged(0),
424 	fSendNext(0),
425 	fSendMax(0),
426 	fSendUrgentOffset(0),
427 	fSendWindow(0),
428 	fSendMaxWindow(0),
429 	fSendMaxSegmentSize(TCP_DEFAULT_MAX_SEGMENT_SIZE),
430 	fSendQueue(socket->send.buffer_size),
431 	fInitialSendSequence(0),
432 	fDuplicateAcknowledgeCount(0),
433 	fRoute(NULL),
434 	fReceiveNext(0),
435 	fReceiveMaxAdvertised(0),
436 	fReceiveWindow(socket->receive.buffer_size),
437 	fReceiveMaxSegmentSize(TCP_DEFAULT_MAX_SEGMENT_SIZE),
438 	fReceiveQueue(socket->receive.buffer_size),
439 	fRoundTripTime(TCP_INITIAL_RTT / kTimestampFactor),
440 	fRoundTripDeviation(TCP_INITIAL_RTT / kTimestampFactor),
441 	fRetransmitTimeout(TCP_INITIAL_RTT),
442 	fReceivedTimestamp(0),
443 	fCongestionWindow(0),
444 	fSlowStartThreshold(0),
445 	fState(CLOSED),
446 	fFlags(FLAG_OPTION_WINDOW_SCALE | FLAG_OPTION_TIMESTAMP)
447 {
448 	// TODO: to be replaced with a real read/write locking strategy!
449 	mutex_init(&fLock, "tcp lock");
450 
451 	fReceiveCondition.Init(this, "tcp receive");
452 	fSendCondition.Init(this, "tcp send");
453 
454 	gStackModule->init_timer(&fPersistTimer, TCPEndpoint::_PersistTimer, this);
455 	gStackModule->init_timer(&fRetransmitTimer, TCPEndpoint::_RetransmitTimer,
456 		this);
457 	gStackModule->init_timer(&fDelayedAcknowledgeTimer,
458 		TCPEndpoint::_DelayedAcknowledgeTimer, this);
459 	gStackModule->init_timer(&fTimeWaitTimer, TCPEndpoint::_TimeWaitTimer,
460 		this);
461 
462 	T(APICall(this, "constructor"));
463 }
464 
465 
466 TCPEndpoint::~TCPEndpoint()
467 {
468 	mutex_lock(&fLock);
469 
470 	T(APICall(this, "destructor"));
471 
472 	_CancelConnectionTimers();
473 	gStackModule->cancel_timer(&fTimeWaitTimer);
474 	T(TimerSet(this, "time-wait", -1));
475 
476 	if (fManager != NULL) {
477 		fManager->Unbind(this);
478 		put_endpoint_manager(fManager);
479 	}
480 
481 	mutex_destroy(&fLock);
482 
483 	// we need to wait for all timers to return
484 	gStackModule->wait_for_timer(&fRetransmitTimer);
485 	gStackModule->wait_for_timer(&fPersistTimer);
486 	gStackModule->wait_for_timer(&fDelayedAcknowledgeTimer);
487 	gStackModule->wait_for_timer(&fTimeWaitTimer);
488 
489 	gDatalinkModule->put_route(Domain(), fRoute);
490 }
491 
492 
493 status_t
494 TCPEndpoint::InitCheck() const
495 {
496 	return B_OK;
497 }
498 
499 
500 //	#pragma mark - protocol API
501 
502 
503 status_t
504 TCPEndpoint::Open()
505 {
506 	TRACE("Open()");
507 	T(APICall(this, "open"));
508 
509 	status_t status = ProtocolSocket::Open();
510 	if (status < B_OK)
511 		return status;
512 
513 	fManager = get_endpoint_manager(Domain());
514 	if (fManager == NULL)
515 		return EAFNOSUPPORT;
516 
517 	return B_OK;
518 }
519 
520 
521 status_t
522 TCPEndpoint::Close()
523 {
524 	MutexLocker locker(fLock);
525 
526 	TRACE("Close()");
527 	T(APICall(this, "close"));
528 
529 	if (fState == LISTEN)
530 		delete_sem(fAcceptSemaphore);
531 
532 	if (fState == SYNCHRONIZE_SENT || fState == LISTEN) {
533 		// TODO: what about linger in case of SYNCHRONIZE_SENT?
534 		fState = CLOSED;
535 		T(State(this));
536 		return B_OK;
537 	}
538 
539 	status_t status = _Disconnect(true);
540 	if (status != B_OK)
541 		return status;
542 
543 	if (socket->options & SO_LINGER) {
544 		TRACE("Close(): Lingering for %i secs", socket->linger);
545 
546 		bigtime_t maximum = absolute_timeout(socket->linger * 1000000LL);
547 
548 		while (fSendQueue.Used() > 0) {
549 			status = _WaitForCondition(fSendCondition, locker, maximum);
550 			if (status == B_TIMED_OUT || status == B_WOULD_BLOCK)
551 				break;
552 			else if (status < B_OK)
553 				return status;
554 		}
555 
556 		TRACE("Close(): after waiting, the SendQ was left with %" B_PRIuSIZE
557 			" bytes.", fSendQueue.Used());
558 	}
559 	return B_OK;
560 }
561 
562 
563 void
564 TCPEndpoint::Free()
565 {
566 	MutexLocker _(fLock);
567 
568 	TRACE("Free()");
569 	T(APICall(this, "free"));
570 
571 	if (fState <= SYNCHRONIZE_SENT)
572 		return;
573 
574 	// we are only interested in the timer, not in changing state
575 	_EnterTimeWait();
576 
577 	fFlags |= FLAG_CLOSED;
578 	if ((fFlags & FLAG_DELETE_ON_CLOSE) == 0) {
579 		// we'll be freed later when the 2MSL timer expires
580 		gSocketModule->acquire_socket(socket);
581 	}
582 }
583 
584 
585 /*!	Creates and sends a synchronize packet to /a address, and then waits
586 	until the connection has been established or refused.
587 */
588 status_t
589 TCPEndpoint::Connect(const sockaddr* address)
590 {
591 	if (!AddressModule()->is_same_family(address))
592 		return EAFNOSUPPORT;
593 
594 	MutexLocker locker(fLock);
595 
596 	TRACE("Connect() on address %s", PrintAddress(address));
597 	T(APICall(this, "connect"));
598 
599 	if (gStackModule->is_restarted_syscall()) {
600 		bigtime_t timeout = gStackModule->restore_syscall_restart_timeout();
601 		status_t status = _WaitForEstablished(locker, timeout);
602 		TRACE("  Connect(): Connection complete: %s (timeout was %"
603 			B_PRIdBIGTIME ")", strerror(status), timeout);
604 		return posix_error(status);
605 	}
606 
607 	// Can only call connect() from CLOSED or LISTEN states
608 	// otherwise endpoint is considered already connected
609 	if (fState == LISTEN) {
610 		// this socket is about to connect; remove pending connections in the backlog
611 		gSocketModule->set_max_backlog(socket, 0);
612 	} else if (fState == ESTABLISHED) {
613 		return EISCONN;
614 	} else if (fState != CLOSED)
615 		return EINPROGRESS;
616 
617 	// consider destination address INADDR_ANY as INADDR_LOOPBACK
618 	sockaddr_storage _address;
619 	if (AddressModule()->is_empty_address(address, false)) {
620 		AddressModule()->get_loopback_address((sockaddr *)&_address);
621 		// for IPv4 and IPv6 the port is at the same offset
622 		((sockaddr_in &)_address).sin_port = ((sockaddr_in *)address)->sin_port;
623 		address = (sockaddr *)&_address;
624 	}
625 
626 	status_t status = _PrepareSendPath(address);
627 	if (status < B_OK)
628 		return status;
629 
630 	TRACE("  Connect(): starting 3-way handshake...");
631 
632 	fState = SYNCHRONIZE_SENT;
633 	T(State(this));
634 
635 	// send SYN
636 	status = _SendQueued();
637 	if (status != B_OK) {
638 		_Close();
639 		return status;
640 	}
641 
642 	// If we are running over Loopback, after _SendQueued() returns we
643 	// may be in ESTABLISHED already.
644 	if (fState == ESTABLISHED) {
645 		TRACE("  Connect() completed after _SendQueued()");
646 		return B_OK;
647 	}
648 
649 	// wait until 3-way handshake is complete (if needed)
650 	bigtime_t timeout = min_c(socket->send.timeout, TCP_CONNECTION_TIMEOUT);
651 	if (timeout == 0) {
652 		// we're a non-blocking socket
653 		TRACE("  Connect() delayed, return EINPROGRESS");
654 		return EINPROGRESS;
655 	}
656 
657 	bigtime_t absoluteTimeout = absolute_timeout(timeout);
658 	gStackModule->store_syscall_restart_timeout(absoluteTimeout);
659 
660 	status = _WaitForEstablished(locker, absoluteTimeout);
661 	TRACE("  Connect(): Connection complete: %s (timeout was %" B_PRIdBIGTIME
662 		")", strerror(status), timeout);
663 	return posix_error(status);
664 }
665 
666 
667 status_t
668 TCPEndpoint::Accept(struct net_socket** _acceptedSocket)
669 {
670 	MutexLocker locker(fLock);
671 
672 	TRACE("Accept()");
673 	T(APICall(this, "accept"));
674 
675 	status_t status;
676 	bigtime_t timeout = absolute_timeout(socket->receive.timeout);
677 	if (gStackModule->is_restarted_syscall())
678 		timeout = gStackModule->restore_syscall_restart_timeout();
679 	else
680 		gStackModule->store_syscall_restart_timeout(timeout);
681 
682 	do {
683 		locker.Unlock();
684 
685 		status = acquire_sem_etc(fAcceptSemaphore, 1, B_ABSOLUTE_TIMEOUT
686 			| B_CAN_INTERRUPT, timeout);
687 		if (status != B_OK) {
688 			if (status == B_TIMED_OUT && socket->receive.timeout == 0)
689 				return B_WOULD_BLOCK;
690 
691 			return status;
692 		}
693 
694 		locker.Lock();
695 		status = gSocketModule->dequeue_connected(socket, _acceptedSocket);
696 #ifdef TRACE_TCP
697 		if (status == B_OK)
698 			TRACE("  Accept() returning %p", (*_acceptedSocket)->first_protocol);
699 #endif
700 	} while (status != B_OK);
701 
702 	return status;
703 }
704 
705 
706 status_t
707 TCPEndpoint::Bind(const sockaddr *address)
708 {
709 	if (address == NULL)
710 		return B_BAD_VALUE;
711 
712 	MutexLocker lock(fLock);
713 
714 	TRACE("Bind() on address %s", PrintAddress(address));
715 	T(APICall(this, "bind"));
716 
717 	if (fState != CLOSED)
718 		return EISCONN;
719 
720 	return fManager->Bind(this, address);
721 }
722 
723 
724 status_t
725 TCPEndpoint::Unbind(struct sockaddr *address)
726 {
727 	MutexLocker _(fLock);
728 
729 	TRACE("Unbind()");
730 	T(APICall(this, "unbind"));
731 
732 	return fManager->Unbind(this);
733 }
734 
735 
736 status_t
737 TCPEndpoint::Listen(int count)
738 {
739 	MutexLocker _(fLock);
740 
741 	TRACE("Listen()");
742 	T(APICall(this, "listen"));
743 
744 	if (fState != CLOSED && fState != LISTEN)
745 		return B_BAD_VALUE;
746 
747 	if (fState == CLOSED) {
748 		fAcceptSemaphore = create_sem(0, "tcp accept");
749 		if (fAcceptSemaphore < B_OK)
750 			return ENOBUFS;
751 
752 		status_t status = fManager->SetPassive(this);
753 		if (status != B_OK) {
754 			delete_sem(fAcceptSemaphore);
755 			fAcceptSemaphore = -1;
756 			return status;
757 		}
758 	}
759 
760 	gSocketModule->set_max_backlog(socket, count);
761 
762 	fState = LISTEN;
763 	T(State(this));
764 	return B_OK;
765 }
766 
767 
768 status_t
769 TCPEndpoint::Shutdown(int direction)
770 {
771 	MutexLocker lock(fLock);
772 
773 	TRACE("Shutdown(%i)", direction);
774 	T(APICall(this, "shutdown"));
775 
776 	if (direction == SHUT_RD || direction == SHUT_RDWR)
777 		fFlags |= FLAG_NO_RECEIVE;
778 
779 	if (direction == SHUT_WR || direction == SHUT_RDWR) {
780 		// TODO: That's not correct. After read/write shutting down the socket
781 		// one should still be able to read previously arrived data.
782 		_Disconnect(false);
783 	}
784 
785 	return B_OK;
786 }
787 
788 
789 /*!	Puts data contained in \a buffer into send buffer */
790 status_t
791 TCPEndpoint::SendData(net_buffer *buffer)
792 {
793 	MutexLocker lock(fLock);
794 
795 	TRACE("SendData(buffer %p, size %" B_PRIu32 ", flags %#" B_PRIx32
796 		") [total %" B_PRIuSIZE " bytes, has %" B_PRIuSIZE "]", buffer,
797 		buffer->size, buffer->flags, fSendQueue.Size(), fSendQueue.Free());
798 	T(APICall(this, "senddata"));
799 
800 	uint32 flags = buffer->flags;
801 
802 	if (fState == CLOSED)
803 		return ENOTCONN;
804 	if (fState == LISTEN)
805 		return EDESTADDRREQ;
806 	if (!is_writable(fState) && !is_establishing(fState)) {
807 		// we only send signals when called from userland
808 		if (gStackModule->is_syscall() && (flags & MSG_NOSIGNAL) == 0)
809 			send_signal(find_thread(NULL), SIGPIPE);
810 		return EPIPE;
811 	}
812 
813 	size_t left = buffer->size;
814 
815 	bigtime_t timeout = absolute_timeout(socket->send.timeout);
816 	if (gStackModule->is_restarted_syscall())
817 		timeout = gStackModule->restore_syscall_restart_timeout();
818 	else
819 		gStackModule->store_syscall_restart_timeout(timeout);
820 
821 	while (left > 0) {
822 		while (fSendQueue.Free() < socket->send.low_water_mark) {
823 			// wait until enough space is available
824 			status_t status = _WaitForCondition(fSendCondition, lock, timeout);
825 			if (status < B_OK) {
826 				TRACE("  SendData() returning %s (%d)",
827 					strerror(posix_error(status)), (int)posix_error(status));
828 				return posix_error(status);
829 			}
830 
831 			if (!is_writable(fState) && !is_establishing(fState)) {
832 				// we only send signals when called from userland
833 				if (gStackModule->is_syscall())
834 					send_signal(find_thread(NULL), SIGPIPE);
835 				return EPIPE;
836 			}
837 		}
838 
839 		size_t size = fSendQueue.Free();
840 		if (size < left) {
841 			// we need to split the original buffer
842 			net_buffer* clone = gBufferModule->clone(buffer, false);
843 				// TODO: add offset/size parameter to net_buffer::clone() or
844 				// even a move_data() function, as this is a bit inefficient
845 			if (clone == NULL)
846 				return ENOBUFS;
847 
848 			status_t status = gBufferModule->trim(clone, size);
849 			if (status != B_OK) {
850 				gBufferModule->free(clone);
851 				return status;
852 			}
853 
854 			gBufferModule->remove_header(buffer, size);
855 			left -= size;
856 			fSendQueue.Add(clone);
857 		} else {
858 			left -= buffer->size;
859 			fSendQueue.Add(buffer);
860 		}
861 	}
862 
863 	TRACE("  SendData(): %" B_PRIuSIZE " bytes used.", fSendQueue.Used());
864 
865 	bool force = false;
866 	if ((flags & MSG_OOB) != 0) {
867 		fSendUrgentOffset = fSendQueue.LastSequence();
868 			// RFC 961 specifies that the urgent offset points to the last
869 			// byte of urgent data. However, this is commonly implemented as
870 			// here, ie. it points to the first byte after the urgent data.
871 		force = true;
872 	}
873 	if ((flags & MSG_EOF) != 0)
874 		_Disconnect(false);
875 
876 	if (fState == ESTABLISHED || fState == FINISH_RECEIVED)
877 		_SendQueued(force);
878 
879 	return B_OK;
880 }
881 
882 
883 ssize_t
884 TCPEndpoint::SendAvailable()
885 {
886 	MutexLocker locker(fLock);
887 
888 	ssize_t available;
889 
890 	if (is_writable(fState))
891 		available = fSendQueue.Free();
892 	else if (is_establishing(fState))
893 		available = 0;
894 	else
895 		available = EPIPE;
896 
897 	TRACE("SendAvailable(): %" B_PRIdSSIZE, available);
898 	T(APICall(this, "sendavailable"));
899 	return available;
900 }
901 
902 
903 status_t
904 TCPEndpoint::FillStat(net_stat *stat)
905 {
906 	MutexLocker _(fLock);
907 
908 	strlcpy(stat->state, name_for_state(fState), sizeof(stat->state));
909 	stat->receive_queue_size = fReceiveQueue.Available();
910 	stat->send_queue_size = fSendQueue.Used();
911 
912 	return B_OK;
913 }
914 
915 
916 status_t
917 TCPEndpoint::ReadData(size_t numBytes, uint32 flags, net_buffer** _buffer)
918 {
919 	MutexLocker locker(fLock);
920 
921 	TRACE("ReadData(%" B_PRIuSIZE " bytes, flags %#" B_PRIx32 ")", numBytes,
922 		flags);
923 	T(APICall(this, "readdata"));
924 
925 	*_buffer = NULL;
926 
927 	if (fState == CLOSED)
928 		return ENOTCONN;
929 
930 	bigtime_t timeout = absolute_timeout(socket->receive.timeout);
931 	if (gStackModule->is_restarted_syscall())
932 		timeout = gStackModule->restore_syscall_restart_timeout();
933 	else
934 		gStackModule->store_syscall_restart_timeout(timeout);
935 
936 	if (fState == SYNCHRONIZE_SENT || fState == SYNCHRONIZE_RECEIVED) {
937 		if (flags & MSG_DONTWAIT)
938 			return B_WOULD_BLOCK;
939 
940 		status_t status = _WaitForEstablished(locker, timeout);
941 		if (status < B_OK)
942 			return posix_error(status);
943 	}
944 
945 	size_t dataNeeded = socket->receive.low_water_mark;
946 
947 	// When MSG_WAITALL is set then the function should block
948 	// until the full amount of data can be returned.
949 	if (flags & MSG_WAITALL)
950 		dataNeeded = numBytes;
951 
952 	// TODO: add support for urgent data (MSG_OOB)
953 
954 	while (true) {
955 		if (fState == CLOSING || fState == WAIT_FOR_FINISH_ACKNOWLEDGE
956 			|| fState == TIME_WAIT) {
957 			// ``Connection closing''.
958 			return B_OK;
959 		}
960 
961 		if (fReceiveQueue.Available() > 0) {
962 			if (fReceiveQueue.Available() >= dataNeeded
963 				|| (fReceiveQueue.PushedData() > 0
964 					&& fReceiveQueue.PushedData() >= fReceiveQueue.Available()))
965 				break;
966 		} else if (fState == FINISH_RECEIVED) {
967 			// ``If no text is awaiting delivery, the RECEIVE will
968 			//   get a Connection closing''.
969 			return B_OK;
970 		}
971 
972 		if ((flags & MSG_DONTWAIT) != 0 || socket->receive.timeout == 0)
973 			return B_WOULD_BLOCK;
974 
975 		if ((fFlags & FLAG_NO_RECEIVE) != 0)
976 			return B_OK;
977 
978 		status_t status = _WaitForCondition(fReceiveCondition, locker, timeout);
979 		if (status < B_OK) {
980 			// The Open Group base specification mentions that EINTR should be
981 			// returned if the recv() is interrupted before _any data_ is
982 			// available. So we actually check if there is data, and if so,
983 			// push it to the user.
984 			if ((status == B_TIMED_OUT || status == B_INTERRUPTED)
985 				&& fReceiveQueue.Available() > 0)
986 				break;
987 
988 			return posix_error(status);
989 		}
990 	}
991 
992 	TRACE("  ReadData(): %" B_PRIuSIZE " are available.",
993 		fReceiveQueue.Available());
994 
995 	if (numBytes < fReceiveQueue.Available())
996 		fReceiveCondition.NotifyAll();
997 
998 	bool clone = (flags & MSG_PEEK) != 0;
999 
1000 	ssize_t receivedBytes = fReceiveQueue.Get(numBytes, !clone, _buffer);
1001 
1002 	TRACE("  ReadData(): %" B_PRIuSIZE " bytes kept.",
1003 		fReceiveQueue.Available());
1004 
1005 	// if we are opening the window, check if we should send an ACK
1006 	if (!clone)
1007 		SendAcknowledge(false);
1008 
1009 	return receivedBytes;
1010 }
1011 
1012 
1013 ssize_t
1014 TCPEndpoint::ReadAvailable()
1015 {
1016 	MutexLocker locker(fLock);
1017 
1018 	TRACE("ReadAvailable(): %" B_PRIdSSIZE, _AvailableData());
1019 	T(APICall(this, "readavailable"));
1020 
1021 	return _AvailableData();
1022 }
1023 
1024 
1025 status_t
1026 TCPEndpoint::SetSendBufferSize(size_t length)
1027 {
1028 	MutexLocker _(fLock);
1029 	fSendQueue.SetMaxBytes(length);
1030 	return B_OK;
1031 }
1032 
1033 
1034 status_t
1035 TCPEndpoint::SetReceiveBufferSize(size_t length)
1036 {
1037 	MutexLocker _(fLock);
1038 	fReceiveQueue.SetMaxBytes(length);
1039 	return B_OK;
1040 }
1041 
1042 
1043 status_t
1044 TCPEndpoint::GetOption(int option, void* _value, int* _length)
1045 {
1046 	if (*_length != sizeof(int))
1047 		return B_BAD_VALUE;
1048 
1049 	int* value = (int*)_value;
1050 
1051 	switch (option) {
1052 		case TCP_NODELAY:
1053 			if ((fOptions & TCP_NODELAY) != 0)
1054 				*value = 1;
1055 			else
1056 				*value = 0;
1057 			return B_OK;
1058 
1059 		case TCP_MAXSEG:
1060 			*value = fReceiveMaxSegmentSize;
1061 			return B_OK;
1062 
1063 		default:
1064 			return B_BAD_VALUE;
1065 	}
1066 }
1067 
1068 
1069 status_t
1070 TCPEndpoint::SetOption(int option, const void* _value, int length)
1071 {
1072 	if (option != TCP_NODELAY)
1073 		return B_BAD_VALUE;
1074 
1075 	if (length != sizeof(int))
1076 		return B_BAD_VALUE;
1077 
1078 	const int* value = (const int*)_value;
1079 
1080 	MutexLocker _(fLock);
1081 	if (*value)
1082 		fOptions |= TCP_NODELAY;
1083 	else
1084 		fOptions &= ~TCP_NODELAY;
1085 
1086 	return B_OK;
1087 }
1088 
1089 
1090 //	#pragma mark - misc
1091 
1092 
1093 bool
1094 TCPEndpoint::IsBound() const
1095 {
1096 	return !LocalAddress().IsEmpty(true);
1097 }
1098 
1099 
1100 bool
1101 TCPEndpoint::IsLocal() const
1102 {
1103 	return (fFlags & FLAG_LOCAL) != 0;
1104 }
1105 
1106 
1107 status_t
1108 TCPEndpoint::DelayedAcknowledge()
1109 {
1110 	if (gStackModule->cancel_timer(&fDelayedAcknowledgeTimer)) {
1111 		// timer was active, send an ACK now (with the exception above,
1112 		// we send every other ACK)
1113 		T(TimerSet(this, "delayed ack", -1));
1114 		return SendAcknowledge(true);
1115 	}
1116 
1117 	gStackModule->set_timer(&fDelayedAcknowledgeTimer,
1118 		TCP_DELAYED_ACKNOWLEDGE_TIMEOUT);
1119 	T(TimerSet(this, "delayed ack", TCP_DELAYED_ACKNOWLEDGE_TIMEOUT));
1120 	return B_OK;
1121 }
1122 
1123 
1124 status_t
1125 TCPEndpoint::SendAcknowledge(bool force)
1126 {
1127 	return _SendQueued(force, 0);
1128 }
1129 
1130 
1131 void
1132 TCPEndpoint::_StartPersistTimer()
1133 {
1134 	gStackModule->set_timer(&fPersistTimer, TCP_PERSIST_TIMEOUT);
1135 	T(TimerSet(this, "persist", TCP_PERSIST_TIMEOUT));
1136 }
1137 
1138 
1139 void
1140 TCPEndpoint::_EnterTimeWait()
1141 {
1142 	TRACE("_EnterTimeWait()");
1143 
1144 	if (fState == TIME_WAIT) {
1145 		_CancelConnectionTimers();
1146 
1147 		if (IsLocal()) {
1148 			// we do not use TIME_WAIT state for local connections
1149 			fFlags |= FLAG_DELETE_ON_CLOSE;
1150 			return;
1151 		}
1152 	}
1153 
1154 	_UpdateTimeWait();
1155 }
1156 
1157 
1158 void
1159 TCPEndpoint::_UpdateTimeWait()
1160 {
1161 	gStackModule->set_timer(&fTimeWaitTimer, TCP_MAX_SEGMENT_LIFETIME << 1);
1162 	T(TimerSet(this, "time-wait", TCP_MAX_SEGMENT_LIFETIME << 1));
1163 }
1164 
1165 
1166 void
1167 TCPEndpoint::_CancelConnectionTimers()
1168 {
1169 	gStackModule->cancel_timer(&fRetransmitTimer);
1170 	T(TimerSet(this, "retransmit", -1));
1171 	gStackModule->cancel_timer(&fPersistTimer);
1172 	T(TimerSet(this, "persist", -1));
1173 	gStackModule->cancel_timer(&fDelayedAcknowledgeTimer);
1174 	T(TimerSet(this, "delayed ack", -1));
1175 }
1176 
1177 
1178 /*!	Sends the FIN flag to the peer when the connection is still open.
1179 	Moves the endpoint to the next state depending on where it was.
1180 */
1181 status_t
1182 TCPEndpoint::_Disconnect(bool closing)
1183 {
1184 	tcp_state previousState = fState;
1185 
1186 	if (fState == SYNCHRONIZE_RECEIVED || fState == ESTABLISHED)
1187 		fState = FINISH_SENT;
1188 	else if (fState == FINISH_RECEIVED)
1189 		fState = WAIT_FOR_FINISH_ACKNOWLEDGE;
1190 	else
1191 		return B_OK;
1192 
1193 	T(State(this));
1194 
1195 	status_t status = _SendQueued();
1196 	if (status != B_OK) {
1197 		fState = previousState;
1198 		T(State(this));
1199 		return status;
1200 	}
1201 
1202 	return B_OK;
1203 }
1204 
1205 
1206 void
1207 TCPEndpoint::_MarkEstablished()
1208 {
1209 	fState = ESTABLISHED;
1210 	T(State(this));
1211 
1212 	if (gSocketModule->has_parent(socket)) {
1213 		gSocketModule->set_connected(socket);
1214 		release_sem_etc(fAcceptSemaphore, 1, B_DO_NOT_RESCHEDULE);
1215 	}
1216 
1217 	fSendCondition.NotifyAll();
1218 	gSocketModule->notify(socket, B_SELECT_WRITE, fSendQueue.Free());
1219 }
1220 
1221 
1222 status_t
1223 TCPEndpoint::_WaitForEstablished(MutexLocker &locker, bigtime_t timeout)
1224 {
1225 	// TODO: Checking for CLOSED seems correct, but breaks several neon tests.
1226 	// When investigating this, also have a look at _Close() and _HandleReset().
1227 	while (fState < ESTABLISHED/* && fState != CLOSED*/) {
1228 		if (socket->error != B_OK)
1229 			return socket->error;
1230 
1231 		status_t status = _WaitForCondition(fSendCondition, locker, timeout);
1232 		if (status < B_OK)
1233 			return status;
1234 	}
1235 
1236 	return B_OK;
1237 }
1238 
1239 
1240 //	#pragma mark - receive
1241 
1242 
1243 void
1244 TCPEndpoint::_Close()
1245 {
1246 	_CancelConnectionTimers();
1247 	fState = CLOSED;
1248 	T(State(this));
1249 
1250 	fFlags |= FLAG_DELETE_ON_CLOSE;
1251 
1252 	fSendCondition.NotifyAll();
1253 	_NotifyReader();
1254 
1255 	if (gSocketModule->has_parent(socket)) {
1256 		// We still have a parent - obviously, we haven't been accepted yet,
1257 		// so no one could ever close us.
1258 		_CancelConnectionTimers();
1259 		gSocketModule->set_aborted(socket);
1260 	}
1261 }
1262 
1263 
1264 void
1265 TCPEndpoint::_HandleReset(status_t error)
1266 {
1267 	socket->error = error;
1268 	_Close();
1269 
1270 	gSocketModule->notify(socket, B_SELECT_WRITE, error);
1271 	gSocketModule->notify(socket, B_SELECT_ERROR, error);
1272 }
1273 
1274 
1275 void
1276 TCPEndpoint::_DuplicateAcknowledge(tcp_segment_header &segment)
1277 {
1278 	if (++fDuplicateAcknowledgeCount < 3)
1279 		return;
1280 
1281 	if (fDuplicateAcknowledgeCount == 3) {
1282 		_ResetSlowStart();
1283 		fCongestionWindow = fSlowStartThreshold + 3 * fSendMaxSegmentSize;
1284 		fSendNext = segment.acknowledge;
1285 	} else if (fDuplicateAcknowledgeCount > 3)
1286 		fCongestionWindow += fSendMaxSegmentSize;
1287 
1288 	_SendQueued();
1289 }
1290 
1291 
1292 void
1293 TCPEndpoint::_UpdateTimestamps(tcp_segment_header& segment,
1294 	size_t segmentLength)
1295 {
1296 	if (fFlags & FLAG_OPTION_TIMESTAMP) {
1297 		tcp_sequence sequence(segment.sequence);
1298 
1299 		if (fLastAcknowledgeSent >= sequence
1300 			&& fLastAcknowledgeSent < (sequence + segmentLength))
1301 			fReceivedTimestamp = segment.timestamp_value;
1302 	}
1303 }
1304 
1305 
1306 ssize_t
1307 TCPEndpoint::_AvailableData() const
1308 {
1309 	// TODO: Refer to the FLAG_NO_RECEIVE comment above regarding
1310 	//       the application of FLAG_NO_RECEIVE in listen()ing
1311 	//       sockets.
1312 	if (fState == LISTEN)
1313 		return gSocketModule->count_connected(socket);
1314 	if (fState == SYNCHRONIZE_SENT)
1315 		return 0;
1316 
1317 	ssize_t availableData = fReceiveQueue.Available();
1318 
1319 	if (availableData == 0 && !_ShouldReceive())
1320 		return ENOTCONN;
1321 
1322 	return availableData;
1323 }
1324 
1325 
1326 void
1327 TCPEndpoint::_NotifyReader()
1328 {
1329 	fReceiveCondition.NotifyAll();
1330 	gSocketModule->notify(socket, B_SELECT_READ, _AvailableData());
1331 }
1332 
1333 
1334 bool
1335 TCPEndpoint::_AddData(tcp_segment_header& segment, net_buffer* buffer)
1336 {
1337 	if ((segment.flags & TCP_FLAG_FINISH) != 0) {
1338 		// Remember the position of the finish received flag
1339 		fFinishReceived = true;
1340 		fFinishReceivedAt = segment.sequence + buffer->size;
1341 	}
1342 
1343 	fReceiveQueue.Add(buffer, segment.sequence);
1344 	fReceiveNext = fReceiveQueue.NextSequence();
1345 
1346 	if (fFinishReceived) {
1347 		// Set or reset the finish flag on the current segment
1348 		if (fReceiveNext < fFinishReceivedAt)
1349 			segment.flags &= ~TCP_FLAG_FINISH;
1350 		else
1351 			segment.flags |= TCP_FLAG_FINISH;
1352 	}
1353 
1354 	TRACE("  _AddData(): adding data, receive next = %" B_PRIu32 ". Now have %"
1355 		B_PRIuSIZE " bytes.", fReceiveNext.Number(), fReceiveQueue.Available());
1356 
1357 	if ((segment.flags & TCP_FLAG_PUSH) != 0)
1358 		fReceiveQueue.SetPushPointer();
1359 
1360 	return fReceiveQueue.Available() > 0;
1361 }
1362 
1363 
1364 void
1365 TCPEndpoint::_PrepareReceivePath(tcp_segment_header& segment)
1366 {
1367 	fInitialReceiveSequence = segment.sequence;
1368 	fFinishReceived = false;
1369 
1370 	// count the received SYN
1371 	segment.sequence++;
1372 
1373 	fReceiveNext = segment.sequence;
1374 	fReceiveQueue.SetInitialSequence(segment.sequence);
1375 
1376 	if ((fOptions & TCP_NOOPT) == 0) {
1377 		if (segment.max_segment_size > 0)
1378 			fSendMaxSegmentSize = segment.max_segment_size;
1379 
1380 		if (segment.options & TCP_HAS_WINDOW_SCALE) {
1381 			fFlags |= FLAG_OPTION_WINDOW_SCALE;
1382 			fSendWindowShift = segment.window_shift;
1383 		} else {
1384 			fFlags &= ~FLAG_OPTION_WINDOW_SCALE;
1385 			fReceiveWindowShift = 0;
1386 		}
1387 
1388 		if (segment.options & TCP_HAS_TIMESTAMPS) {
1389 			fFlags |= FLAG_OPTION_TIMESTAMP;
1390 			fReceivedTimestamp = segment.timestamp_value;
1391 		} else
1392 			fFlags &= ~FLAG_OPTION_TIMESTAMP;
1393 	}
1394 
1395 	fCongestionWindow = 2 * fSendMaxSegmentSize;
1396 	fSlowStartThreshold = (uint32)segment.advertised_window << fSendWindowShift;
1397 }
1398 
1399 
1400 bool
1401 TCPEndpoint::_ShouldReceive() const
1402 {
1403 	if ((fFlags & FLAG_NO_RECEIVE) != 0)
1404 		return false;
1405 
1406 	return fState == ESTABLISHED || fState == FINISH_SENT
1407 		|| fState == FINISH_ACKNOWLEDGED;
1408 }
1409 
1410 
1411 int32
1412 TCPEndpoint::_Spawn(TCPEndpoint* parent, tcp_segment_header& segment,
1413 	net_buffer* buffer)
1414 {
1415 	MutexLocker _(fLock);
1416 
1417 	// TODO error checking
1418 	ProtocolSocket::Open();
1419 
1420 	fState = SYNCHRONIZE_RECEIVED;
1421 	T(Spawn(parent, this));
1422 
1423 	fManager = parent->fManager;
1424 
1425 	LocalAddress().SetTo(buffer->destination);
1426 	PeerAddress().SetTo(buffer->source);
1427 
1428 	TRACE("Spawn()");
1429 
1430 	// TODO: proper error handling!
1431 	if (fManager->BindChild(this) != B_OK) {
1432 		T(Error(this, "binding failed", __LINE__));
1433 		return DROP;
1434 	}
1435 	if (_PrepareSendPath(*PeerAddress()) != B_OK) {
1436 		T(Error(this, "prepare send faild", __LINE__));
1437 		return DROP;
1438 	}
1439 
1440 	fOptions = parent->fOptions;
1441 	fAcceptSemaphore = parent->fAcceptSemaphore;
1442 
1443 	_PrepareReceivePath(segment);
1444 
1445 	// send SYN+ACK
1446 	if (_SendQueued() != B_OK) {
1447 		T(Error(this, "sending failed", __LINE__));
1448 		return DROP;
1449 	}
1450 
1451 	segment.flags &= ~TCP_FLAG_SYNCHRONIZE;
1452 		// we handled this flag now, it must not be set for further processing
1453 
1454 	return _Receive(segment, buffer);
1455 }
1456 
1457 
1458 int32
1459 TCPEndpoint::_ListenReceive(tcp_segment_header& segment, net_buffer* buffer)
1460 {
1461 	TRACE("ListenReceive()");
1462 
1463 	// Essentially, we accept only TCP_FLAG_SYNCHRONIZE in this state,
1464 	// but the error behaviour differs
1465 	if (segment.flags & TCP_FLAG_RESET)
1466 		return DROP;
1467 	if (segment.flags & TCP_FLAG_ACKNOWLEDGE)
1468 		return DROP | RESET;
1469 	if ((segment.flags & TCP_FLAG_SYNCHRONIZE) == 0)
1470 		return DROP;
1471 
1472 	// TODO: drop broadcast/multicast
1473 
1474 	// spawn new endpoint for accept()
1475 	net_socket* newSocket;
1476 	if (gSocketModule->spawn_pending_socket(socket, &newSocket) < B_OK) {
1477 		T(Error(this, "spawning failed", __LINE__));
1478 		return DROP;
1479 	}
1480 
1481 	return ((TCPEndpoint *)newSocket->first_protocol)->_Spawn(this,
1482 		segment, buffer);
1483 }
1484 
1485 
1486 int32
1487 TCPEndpoint::_SynchronizeSentReceive(tcp_segment_header &segment,
1488 	net_buffer *buffer)
1489 {
1490 	TRACE("_SynchronizeSentReceive()");
1491 
1492 	if ((segment.flags & TCP_FLAG_ACKNOWLEDGE) != 0
1493 		&& (fInitialSendSequence >= segment.acknowledge
1494 			|| fSendMax < segment.acknowledge))
1495 		return DROP | RESET;
1496 
1497 	if (segment.flags & TCP_FLAG_RESET) {
1498 		_HandleReset(ECONNREFUSED);
1499 		return DROP;
1500 	}
1501 
1502 	if ((segment.flags & TCP_FLAG_SYNCHRONIZE) == 0)
1503 		return DROP;
1504 
1505 	fSendUnacknowledged = segment.acknowledge;
1506 	_PrepareReceivePath(segment);
1507 
1508 	if (segment.flags & TCP_FLAG_ACKNOWLEDGE) {
1509 		_MarkEstablished();
1510 	} else {
1511 		// simultaneous open
1512 		fState = SYNCHRONIZE_RECEIVED;
1513 		T(State(this));
1514 	}
1515 
1516 	segment.flags &= ~TCP_FLAG_SYNCHRONIZE;
1517 		// we handled this flag now, it must not be set for further processing
1518 
1519 	return _Receive(segment, buffer) | IMMEDIATE_ACKNOWLEDGE;
1520 }
1521 
1522 
/*!	Processes an incoming segment in every state except LISTEN and
	SYNCHRONIZE_SENT (see the ASSERT below). _Spawn() and
	_SynchronizeSentReceive() clear the SYN flag of the segment before they
	call this method.

	A fast path ("header prediction") handles plain, in-order acknowledgement
	or data segments in ESTABLISHED state. Everything else goes through the
	full processing: sequence check, RST and SYN handling, trimming of the
	buffer to the receive window, acknowledgement processing, queuing of the
	data, and FIN processing including the resulting state transitions.

	\param segment The header of the received segment; its sequence and flags
		may be modified while the buffer is trimmed and queued.
	\param buffer The payload of the segment.
	\return A combination of the DROP/KEEP, ACKNOWLEDGE/IMMEDIATE_ACKNOWLEDGE,
		and RESET flags describing what is to be done with the segment.
*/
1523 int32
1524 TCPEndpoint::_Receive(tcp_segment_header& segment, net_buffer* buffer)
1525 {
1526 	uint32 advertisedWindow = (uint32)segment.advertised_window
1527 		<< fSendWindowShift;
1528 	size_t segmentLength = buffer->size;
1529 
1530 	// First, handle the most common case for uni-directional data transfer
1531 	// (known as header prediction - the segment must not change the window,
1532 	// and must be the expected sequence, and contain no control flags)
1533 
1534 	if (fState == ESTABLISHED
1535 		&& segment.AcknowledgeOnly()
1536 		&& fReceiveNext == segment.sequence
1537 		&& advertisedWindow > 0 && advertisedWindow == fSendWindow
1538 		&& fSendNext == fSendMax) {
1539 		_UpdateTimestamps(segment, segmentLength);
1540 
1541 		if (segmentLength == 0) {
1542 			// this is a pure acknowledge segment - we're on the sending end
1543 			if (fSendUnacknowledged < segment.acknowledge
1544 				&& fSendMax >= segment.acknowledge) {
1545 				_Acknowledged(segment);
1546 				return DROP;
1547 			}
1548 		} else if (segment.acknowledge == fSendUnacknowledged
1549 			&& fReceiveQueue.IsContiguous()
1550 			&& fReceiveQueue.Free() >= segmentLength
1551 			&& (fFlags & FLAG_NO_RECEIVE) == 0) {
     			// pure in-order data that fits into the queue - we're on the
     			// receiving end
1552 			if (_AddData(segment, buffer))
1553 				_NotifyReader();
1554 
1555 			return KEEP | ((segment.flags & TCP_FLAG_PUSH) != 0
1556 				? IMMEDIATE_ACKNOWLEDGE : ACKNOWLEDGE);
1557 		}
1558 	}
1559 
1560 	// The fast path was not applicable, so we continue with the standard
1561 	// processing of the incoming segment
1562 
1563 	ASSERT(fState != SYNCHRONIZE_SENT && fState != LISTEN);
1564 
1565 	if (fState != CLOSED && fState != TIME_WAIT) {
1566 		// Check sequence number
1567 		if (!segment_in_sequence(segment, segmentLength, fReceiveNext,
1568 				fReceiveWindow)) {
1569 			TRACE("  Receive(): segment out of window, next: %" B_PRIu32
1570 				" wnd: %" B_PRIu32, fReceiveNext.Number(), fReceiveWindow);
1571 			if ((segment.flags & TCP_FLAG_RESET) != 0) {
1572 				// TODO: this doesn't look right - review!
1573 				return DROP;
1574 			}
1575 			return DROP | IMMEDIATE_ACKNOWLEDGE;
1576 		}
1577 	}
1578 
1579 	if ((segment.flags & TCP_FLAG_RESET) != 0) {
1580 		// Is this a valid reset?
1581 		// We generally ignore resets in time wait state (see RFC 1337)
1582 		if (fLastAcknowledgeSent <= segment.sequence
1583 			&& tcp_sequence(segment.sequence) < (fLastAcknowledgeSent
1584 				+ fReceiveWindow)
1585 			&& fState != TIME_WAIT) {
     			// The error reported to the socket depends on how far the
     			// connection had come.
1586 			status_t error;
1587 			if (fState == SYNCHRONIZE_RECEIVED)
1588 				error = ECONNREFUSED;
1589 			else if (fState == CLOSING || fState == WAIT_FOR_FINISH_ACKNOWLEDGE)
1590 				error = ENOTCONN;
1591 			else
1592 				error = ECONNRESET;
1593 
1594 			_HandleReset(error);
1595 		}
1596 
1597 		return DROP;
1598 	}
1599 
1600 	if ((segment.flags & TCP_FLAG_SYNCHRONIZE) != 0
1601 		|| (fState == SYNCHRONIZE_RECEIVED
1602 			&& (fInitialReceiveSequence > segment.sequence
1603 				|| ((segment.flags & TCP_FLAG_ACKNOWLEDGE) != 0
1604 					&& (fSendUnacknowledged > segment.acknowledge
1605 						|| fSendMax < segment.acknowledge))))) {
1606 		// reset the connection - either the initial SYN was faulty, or we
1607 		// received a SYN within the data stream
1608 		return DROP | RESET;
1609 	}
1610 
1611 	// TODO: Check this! Why do we advertize a window outside of what we should
1612 	// buffer?
1613 	fReceiveWindow = max_c(fReceiveQueue.Free(), fReceiveWindow);
1614 		// the window must not shrink
1615 
1616 	// trim buffer to be within the receive window
     	// (a positive value means the segment starts before fReceiveNext, that
     	// is, its first "drop" bytes have been received already)
1617 	int32 drop = (int32)(fReceiveNext - segment.sequence).Number();
1618 	if (drop > 0) {
1619 		if ((uint32)drop > buffer->size
1620 			|| ((uint32)drop == buffer->size
1621 				&& (segment.flags & TCP_FLAG_FINISH) == 0)) {
1622 			// don't accidently remove a FIN we shouldn't remove
1623 			segment.flags &= ~TCP_FLAG_FINISH;
1624 			drop = buffer->size;
1625 		}
1626 
1627 		// remove duplicate data at the start
1628 		TRACE("* remove %" B_PRId32 " bytes from the start", drop);
1629 		gBufferModule->remove_header(buffer, drop);
1630 		segment.sequence += drop;
1631 	}
1632 
1633 	int32 action = KEEP;
1634 
     	// (a positive value means the segment reaches "drop" bytes beyond the end
     	// of the receive window)
1635 	drop = (int32)(segment.sequence + buffer->size
1636 		- (fReceiveNext + fReceiveWindow)).Number();
1637 	if (drop > 0) {
1638 		// remove data exceeding our window
1639 		if ((uint32)drop >= buffer->size) {
1640 			// if we can accept data, or the segment is not what we'd expect,
1641 			// drop the segment (an immediate acknowledge is always triggered)
1642 			if (fReceiveWindow != 0 || segment.sequence != fReceiveNext)
1643 				return DROP | IMMEDIATE_ACKNOWLEDGE;
1644 
1645 			action |= IMMEDIATE_ACKNOWLEDGE;
1646 		}
1647 
1648 		if ((segment.flags & TCP_FLAG_FINISH) != 0) {
1649 			// we need to remove the finish, too, as part of the data
1650 			drop--;
1651 		}
1652 
1653 		segment.flags &= ~(TCP_FLAG_FINISH | TCP_FLAG_PUSH);
1654 		TRACE("* remove %" B_PRId32 " bytes from the end", drop);
1655 		gBufferModule->remove_trailer(buffer, drop);
1656 	}
1657 
1658 #ifdef TRACE_TCP
1659 	if (advertisedWindow > fSendWindow) {
1660 		TRACE("  Receive(): Window update %" B_PRIu32 " -> %" B_PRIu32,
1661 			fSendWindow, advertisedWindow);
1662 	}
1663 #endif
1664 
     	// Take over the window the peer advertised, and keep track of the largest
     	// window seen so far.
1665 	fSendWindow = advertisedWindow;
1666 	if (advertisedWindow > fSendMaxWindow)
1667 		fSendMaxWindow = advertisedWindow;
1668 
1669 	// Then look at the acknowledgement for any updates
1670 
1671 	if ((segment.flags & TCP_FLAG_ACKNOWLEDGE) != 0) {
1672 		// process acknowledged data
     		// (in SYNCHRONIZE_RECEIVED state, the acknowledgement has already
     		// been checked against fSendUnacknowledged and fSendMax above)
1673 		if (fState == SYNCHRONIZE_RECEIVED)
1674 			_MarkEstablished();
1675 
1676 		if (fSendMax < segment.acknowledge)
1677 			return DROP | IMMEDIATE_ACKNOWLEDGE;
1678 
     		// NOTE(review): this branch is only taken for acknowledgements
     		// *below* fSendUnacknowledged; an acknowledgement equal to
     		// fSendUnacknowledged ends up in the else branch. Also, fSendWindow
     		// has just been set to advertisedWindow above, so the window
     		// comparison below is always true. Confirm that this is the intended
     		// duplicate acknowledgement detection.
1679 		if (segment.acknowledge < fSendUnacknowledged) {
1680 			if (buffer->size == 0 && advertisedWindow == fSendWindow
1681 				&& (segment.flags & TCP_FLAG_FINISH) == 0) {
1682 				TRACE("Receive(): duplicate ack!");
1683 
1684 				_DuplicateAcknowledge(segment);
1685 			}
1686 
1687 			return DROP;
1688 		} else {
1689 			// this segment acknowledges in flight data
1690 
1691 			if (fDuplicateAcknowledgeCount >= 3) {
1692 				// deflate the window.
1693 				fCongestionWindow = fSlowStartThreshold;
1694 			}
1695 
1696 			fDuplicateAcknowledgeCount = 0;
1697 
1698 			if (fSendMax == segment.acknowledge)
1699 				TRACE("Receive(): all inflight data ack'd!");
1700 
1701 			if (segment.acknowledge > fSendQueue.LastSequence()
1702 					&& fState > ESTABLISHED) {
1703 				TRACE("Receive(): FIN has been acknowledged!");
1704 
1705 				switch (fState) {
1706 					case FINISH_SENT:
1707 						fState = FINISH_ACKNOWLEDGED;
1708 						T(State(this));
1709 						break;
1710 					case CLOSING:
1711 						fState = TIME_WAIT;
1712 						T(State(this));
1713 						_EnterTimeWait();
1714 						return DROP;
1715 					case WAIT_FOR_FINISH_ACKNOWLEDGE:
1716 						_Close();
1717 						break;
1718 
1719 					default:
1720 						break;
1721 				}
1722 			}
1723 
     			// _Close() above may have moved us to CLOSED state
1724 			if (fState != CLOSED)
1725 				_Acknowledged(segment);
1726 		}
1727 	}
1728 
1729 	if (segment.flags & TCP_FLAG_URGENT) {
1730 		if (fState == ESTABLISHED || fState == FINISH_SENT
1731 			|| fState == FINISH_ACKNOWLEDGED) {
1732 			// TODO: Handle urgent data:
1733 			//  - RCV.UP <- max(RCV.UP, SEG.UP)
1734 			//  - signal the user that urgent data is available (SIGURG)
1735 		}
1736 	}
1737 
1738 	bool notify = false;
1739 
1740 	// The buffer may be freed if its data is added to the queue, so cache
1741 	// the size as we still need it later.
1742 	uint32 bufferSize = buffer->size;
1743 
1744 	if ((bufferSize > 0 || (segment.flags & TCP_FLAG_FINISH) != 0)
1745 		&& _ShouldReceive())
1746 		notify = _AddData(segment, buffer);
1747 	else {
     		// With FLAG_NO_RECEIVE set, the data is not queued, but fReceiveNext
     		// is still advanced past it.
1748 		if ((fFlags & FLAG_NO_RECEIVE) != 0)
1749 			fReceiveNext += buffer->size;
1750 
1751 		action = (action & ~KEEP) | DROP;
1752 	}
1753 
1754 	if ((segment.flags & TCP_FLAG_FINISH) != 0) {
     		// the FIN counts as one unit in sequence space
1755 		segmentLength++;
1756 		if (fState != CLOSED && fState != LISTEN && fState != SYNCHRONIZE_SENT) {
1757 			TRACE("Receive(): peer is finishing connection!");
1758 			fReceiveNext++;
1759 			notify = true;
1760 
1761 			// FIN implies PUSH
1762 			fReceiveQueue.SetPushPointer();
1763 
1764 			// we'll reply immediately to the FIN if we are not
1765 			// transitioning to TIME WAIT so we immediatly ACK it.
1766 			action |= IMMEDIATE_ACKNOWLEDGE;
1767 
1768 			// other side is closing connection; change states
1769 			switch (fState) {
1770 				case ESTABLISHED:
1771 				case SYNCHRONIZE_RECEIVED:
1772 					fState = FINISH_RECEIVED;
1773 					T(State(this));
1774 					break;
1775 				case FINISH_SENT:
1776 					// simultaneous close
1777 					fState = CLOSING;
1778 					T(State(this));
1779 					break;
1780 				case FINISH_ACKNOWLEDGED:
1781 					fState = TIME_WAIT;
1782 					T(State(this));
1783 					_EnterTimeWait();
1784 					break;
1785 				case TIME_WAIT:
     					// restart the time-wait timer
1786 					_UpdateTimeWait();
1787 					break;
1788 
1789 				default:
1790 					break;
1791 			}
1792 		}
1793 	}
1794 
1795 	if (notify)
1796 		_NotifyReader();
1797 
     	// NOTE(review): any segment with the SYN flag set has already been
     	// answered with DROP | RESET above, so only bufferSize can make this
     	// condition true.
1798 	if (bufferSize > 0 || (segment.flags & TCP_FLAG_SYNCHRONIZE) != 0)
1799 		action |= ACKNOWLEDGE;
1800 
1801 	_UpdateTimestamps(segment, segmentLength);
1802 
1803 	TRACE("Receive() Action %" B_PRId32, action);
1804 
1805 	return action;
1806 }
1807 
1808 
1809 int32
1810 TCPEndpoint::SegmentReceived(tcp_segment_header& segment, net_buffer* buffer)
1811 {
1812 	MutexLocker locker(fLock);
1813 
1814 	TRACE("SegmentReceived(): buffer %p (%" B_PRIu32 " bytes) address %s "
1815 		"to %s flags %#" B_PRIx8 ", seq %" B_PRIu32 ", ack %" B_PRIu32
1816 		", wnd %" B_PRIu32, buffer, buffer->size, PrintAddress(buffer->source),
1817 		PrintAddress(buffer->destination), segment.flags, segment.sequence,
1818 		segment.acknowledge,
1819 		(uint32)segment.advertised_window << fSendWindowShift);
1820 	T(Receive(this, segment,
1821 		(uint32)segment.advertised_window << fSendWindowShift, buffer));
1822 	int32 segmentAction = DROP;
1823 
1824 	switch (fState) {
1825 		case LISTEN:
1826 			segmentAction = _ListenReceive(segment, buffer);
1827 			break;
1828 
1829 		case SYNCHRONIZE_SENT:
1830 			segmentAction = _SynchronizeSentReceive(segment, buffer);
1831 			break;
1832 
1833 		case SYNCHRONIZE_RECEIVED:
1834 		case ESTABLISHED:
1835 		case FINISH_RECEIVED:
1836 		case WAIT_FOR_FINISH_ACKNOWLEDGE:
1837 		case FINISH_SENT:
1838 		case FINISH_ACKNOWLEDGED:
1839 		case CLOSING:
1840 		case TIME_WAIT:
1841 		case CLOSED:
1842 			segmentAction = _Receive(segment, buffer);
1843 			break;
1844 	}
1845 
1846 	// process acknowledge action as asked for by the *Receive() method
1847 	if (segmentAction & IMMEDIATE_ACKNOWLEDGE)
1848 		SendAcknowledge(true);
1849 	else if (segmentAction & ACKNOWLEDGE)
1850 		DelayedAcknowledge();
1851 
1852 	if ((fFlags & (FLAG_CLOSED | FLAG_DELETE_ON_CLOSE))
1853 			== (FLAG_CLOSED | FLAG_DELETE_ON_CLOSE)) {
1854 		locker.Unlock();
1855 		gSocketModule->release_socket(socket);
1856 	}
1857 
1858 	return segmentAction;
1859 }
1860 
1861 
1862 //	#pragma mark - send
1863 
1864 
1865 inline uint8
1866 TCPEndpoint::_CurrentFlags()
1867 {
1868 	// we don't set FLAG_FINISH here, instead we do it
1869 	// conditionally below depending if we are sending
1870 	// the last bytes of the send queue.
1871 
1872 	switch (fState) {
1873 		case CLOSED:
1874 			return TCP_FLAG_RESET | TCP_FLAG_ACKNOWLEDGE;
1875 
1876 		case SYNCHRONIZE_SENT:
1877 			return TCP_FLAG_SYNCHRONIZE;
1878 		case SYNCHRONIZE_RECEIVED:
1879 			return TCP_FLAG_SYNCHRONIZE | TCP_FLAG_ACKNOWLEDGE;
1880 
1881 		case ESTABLISHED:
1882 		case FINISH_RECEIVED:
1883 		case FINISH_ACKNOWLEDGED:
1884 		case TIME_WAIT:
1885 		case WAIT_FOR_FINISH_ACKNOWLEDGE:
1886 		case FINISH_SENT:
1887 		case CLOSING:
1888 			return TCP_FLAG_ACKNOWLEDGE;
1889 
1890 		default:
1891 			return 0;
1892 	}
1893 }
1894 
1895 
1896 inline bool
1897 TCPEndpoint::_ShouldSendSegment(tcp_segment_header& segment, uint32 length,
1898 	uint32 segmentMaxSize, uint32 flightSize)
1899 {
1900 	if (length > 0) {
1901 		// Avoid the silly window syndrome - we only send a segment in case:
1902 		// - we have a full segment to send, or
1903 		// - we're at the end of our buffer queue, or
1904 		// - the buffer is at least larger than half of the maximum send window,
1905 		//   or
1906 		// - we're retransmitting data
1907 		if (length == segmentMaxSize
1908 			|| (fOptions & TCP_NODELAY) != 0
1909 			|| tcp_sequence(fSendNext + length) == fSendQueue.LastSequence()
1910 			|| (fSendMaxWindow > 0 && length >= fSendMaxWindow / 2))
1911 			return true;
1912 	}
1913 
1914 	// check if we need to send a window update to the peer
1915 	if (segment.advertised_window > 0) {
1916 		// correct the window to take into account what already has been advertised
1917 		uint32 window = (segment.advertised_window << fReceiveWindowShift)
1918 			- (fReceiveMaxAdvertised - fReceiveNext).Number();
1919 
1920 		// if we can advertise a window larger than twice the maximum segment
1921 		// size, or half the maximum buffer size we send a window update
1922 		if (window >= (fReceiveMaxSegmentSize << 1)
1923 			|| window >= (socket->receive.buffer_size >> 1))
1924 			return true;
1925 	}
1926 
1927 	if ((segment.flags & (TCP_FLAG_SYNCHRONIZE | TCP_FLAG_FINISH
1928 			| TCP_FLAG_RESET)) != 0)
1929 		return true;
1930 
1931 	// We do have urgent data pending
1932 	if (fSendUrgentOffset > fSendNext)
1933 		return true;
1934 
1935 	// there is no reason to send a segment just now
1936 	return false;
1937 }
1938 
1939 
1940 status_t
1941 TCPEndpoint::_SendQueued(bool force)
1942 {
1943 	return _SendQueued(force, fSendWindow);
1944 }
1945 
1946 
1947 /*!	Sends one or more TCP segments with the data waiting in the queue, or some
1948 	specific flags that need to be sent.

	Each segment is built from the same header, which carries the flags of
	the current state (see _CurrentFlags()), the advertised receive window,
	and the acknowledgement for fReceiveNext. The send state (fSendNext,
	fSendMax) is updated before a segment is handed to the next protocol,
	and is restored should sending fail.

	\param force If \c true, _ShouldSendSegment() is not consulted; with a
		closed send window and data left to send, a single byte is sent to
		ask for a window update.
	\param sendWindow The send window to use. It is limited further by
		fCongestionWindow (if that is non-zero), and reduced by the bytes
		between fSendUnacknowledged and fSendNext.
	\return B_ERROR if there is no route or the endpoint is in LISTEN state,
		B_NO_MEMORY if no buffer could be created, the respective error if
		getting the data from the send queue, adding the TCP header, or
		sending the buffer fails, and B_OK otherwise (also if nothing had to
		be sent).
1949 */
1950 status_t
1951 TCPEndpoint::_SendQueued(bool force, uint32 sendWindow)
1952 {
1953 	if (fRoute == NULL)
1954 		return B_ERROR;
1955 
1956 	// in passive state?
1957 	if (fState == LISTEN)
1958 		return B_ERROR;
1959 
1960 	tcp_segment_header segment(_CurrentFlags());
1961 
1962 	if ((fOptions & TCP_NOOPT) == 0) {
1963 		if ((fFlags & FLAG_OPTION_TIMESTAMP) != 0) {
1964 			segment.options |= TCP_HAS_TIMESTAMPS;
1965 			segment.timestamp_reply = fReceivedTimestamp;
1966 			segment.timestamp_value = tcp_now();
1967 		}
1968 
     		// only the very first SYN (not a retransmitted one at a later
     		// sequence) carries the connection establishment options
1969 		if ((segment.flags & TCP_FLAG_SYNCHRONIZE) != 0
1970 			&& fSendNext == fInitialSendSequence) {
1971 			// add connection establishment options
1972 			segment.max_segment_size = fReceiveMaxSegmentSize;
1973 			if (fFlags & FLAG_OPTION_WINDOW_SCALE) {
1974 				segment.options |= TCP_HAS_WINDOW_SCALE;
1975 				segment.window_shift = fReceiveWindowShift;
1976 			}
1977 		}
1978 	}
1979 
     	// Advertise the free space of the receive queue as our window - scaled
     	// if window scaling is in use, capped to TCP_MAX_WINDOW otherwise.
1980 	size_t availableBytes = fReceiveQueue.Free();
1981 	if (fFlags & FLAG_OPTION_WINDOW_SCALE)
1982 		segment.advertised_window = availableBytes >> fReceiveWindowShift;
1983 	else
1984 		segment.advertised_window = min_c(TCP_MAX_WINDOW, availableBytes);
1985 
1986 	segment.acknowledge = fReceiveNext.Number();
1987 
1988 	// Process urgent data
1989 	if (fSendUrgentOffset > fSendNext) {
1990 		segment.flags |= TCP_FLAG_URGENT;
1991 		segment.urgent_offset = (fSendUrgentOffset - fSendNext).Number();
1992 	} else {
1993 		fSendUrgentOffset = fSendUnacknowledged.Number();
1994 			// Keep urgent offset updated, so that it doesn't reach into our
1995 			// send window on overlap
1996 		segment.urgent_offset = 0;
1997 	}
1998 
     	// The congestion window may only reduce the send window.
1999 	if (fCongestionWindow > 0 && fCongestionWindow < sendWindow)
2000 		sendWindow = fCongestionWindow;
2001 
2002 	// fSendUnacknowledged
2003 	//  |    fSendNext      fSendMax
2004 	//  |        |              |
2005 	//  v        v              v
2006 	//  -----------------------------------
2007 	//  | effective window           |
2008 	//  -----------------------------------
2009 
2010 	// Flight size represents the window of data which is currently in the
2011 	// ether. We should never send data such as the flight size becomes larger
2012 	// than the effective window. Note however that the effective window may be
2013 	// reduced (by congestion for instance), so at some point in time flight
2014 	// size may be larger than the currently calculated window.
2015 
     	// NOTE(review): flightSize is only passed on to _ShouldSendSegment(),
     	// which does not make use of it.
2016 	uint32 flightSize = (fSendMax - fSendUnacknowledged).Number();
2017 	uint32 consumedWindow = (fSendNext - fSendUnacknowledged).Number();
2018 
2019 	if (consumedWindow > sendWindow) {
2020 		sendWindow = 0;
2021 		// TODO: enter persist state? try to get a window update.
2022 	} else
2023 		sendWindow -= consumedWindow;
2024 
2025 	if (force && sendWindow == 0 && fSendNext <= fSendQueue.LastSequence()) {
2026 		// send one byte of data to ask for a window update
2027 		// (triggered by the persist timer)
2028 		sendWindow = 1;
2029 	}
2030 
     	// "length" is the total number of data bytes this call may send; the
     	// loop below splits it into segments.
2031 	uint32 length = min_c(fSendQueue.Available(fSendNext), sendWindow);
     	// The retransmit timer is only started by this call if sending begins at
     	// the first unacknowledged byte.
2032 	bool shouldStartRetransmitTimer = fSendNext == fSendUnacknowledged;
     	// Sending below fSendMax means sending something again.
2033 	bool retransmit = fSendNext < fSendMax;
2034 
2035 	do {
2036 		uint32 segmentMaxSize = fSendMaxSegmentSize
2037 			- tcp_options_length(segment);
2038 		uint32 segmentLength = min_c(length, segmentMaxSize);
2039 
     		// This segment reaches the end of the send queue
2040 		if (fSendNext + segmentLength == fSendQueue.LastSequence()) {
2041 			if (state_needs_finish(fState))
2042 				segment.flags |= TCP_FLAG_FINISH;
2043 			if (length > 0)
2044 				segment.flags |= TCP_FLAG_PUSH;
2045 		}
2046 
2047 		// Determine if we should really send this segment
2048 		if (!force && !retransmit && !_ShouldSendSegment(segment, segmentLength,
2049 				segmentMaxSize, flightSize)) {
     			// Data stays behind in the queue - make sure a timer is running
     			// that gets us back here.
2050 			if (fSendQueue.Available()
2051 				&& !gStackModule->is_timer_active(&fPersistTimer)
2052 				&& !gStackModule->is_timer_active(&fRetransmitTimer))
2053 				_StartPersistTimer();
2054 			break;
2055 		}
2056 
2057 		net_buffer *buffer = gBufferModule->create(256);
2058 		if (buffer == NULL)
2059 			return B_NO_MEMORY;
2060 
2061 		status_t status = B_OK;
2062 		if (segmentLength > 0)
2063 			status = fSendQueue.Get(buffer, fSendNext, segmentLength);
2064 		if (status < B_OK) {
2065 			gBufferModule->free(buffer);
2066 			return status;
2067 		}
2068 
2069 		LocalAddress().CopyTo(buffer->source);
2070 		PeerAddress().CopyTo(buffer->destination);
2071 
     		// "size" is what this segment takes up in sequence space: the data
     		// now, plus one each for SYN and FIN below. It has to be read before
     		// the TCP header is added to the buffer.
2072 		uint32 size = buffer->size;
2073 		segment.sequence = fSendNext.Number();
2074 
2075 		TRACE("SendQueued(): buffer %p (%" B_PRIu32 " bytes) address %s to "
2076 			"%s flags %#" B_PRIx8 ", seq %" B_PRIu32 ", ack %" B_PRIu32
2077 			", rwnd %" B_PRIu16 ", cwnd %" B_PRIu32 ", ssthresh %" B_PRIu32
2078 			", len %" B_PRIu32 ", first %" B_PRIu32 ", last %" B_PRIu32,
2079 			buffer, buffer->size, PrintAddress(buffer->source),
2080 			PrintAddress(buffer->destination), segment.flags, segment.sequence,
2081 			segment.acknowledge, segment.advertised_window,
2082 			fCongestionWindow, fSlowStartThreshold, segmentLength,
2083 			fSendQueue.FirstSequence().Number(),
2084 			fSendQueue.LastSequence().Number());
2085 		T(Send(this, segment, buffer, fSendQueue.FirstSequence(),
2086 			fSendQueue.LastSequence()));
2087 
2088 		PROBE(buffer, sendWindow);
2089 		sendWindow -= buffer->size;
2090 
2091 		status = add_tcp_header(AddressModule(), segment, buffer);
2092 		if (status != B_OK) {
2093 			gBufferModule->free(buffer);
2094 			return status;
2095 		}
2096 
2097 		// Update send status - we need to do this before we send the data
2098 		// for local connections as the answer is directly handled
2099 
2100 		if (segment.flags & TCP_FLAG_SYNCHRONIZE) {
     			// The establishment options are in the header that has just been
     			// added; remove them from "segment" again (presumably so that
     			// segments built later in this loop do not repeat them).
2101 			segment.options &= ~TCP_HAS_WINDOW_SCALE;
2102 			segment.max_segment_size = 0;
2103 			size++;
2104 		}
2105 
2106 		if (segment.flags & TCP_FLAG_FINISH)
2107 			size++;
2108 
     		// remember fSendMax to be able to restore it if sending fails
2109 		uint32 sendMax = fSendMax.Number();
2110 		fSendNext += size;
2111 		if (fSendMax < fSendNext)
2112 			fSendMax = fSendNext;
2113 
2114 		fReceiveMaxAdvertised = fReceiveNext
2115 			+ ((uint32)segment.advertised_window << fReceiveWindowShift);
2116 
2117 		status = next->module->send_routed_data(next, fRoute, buffer);
2118 		if (status < B_OK) {
2119 			gBufferModule->free(buffer);
2120 
2121 			fSendNext = segment.sequence;
2122 			fSendMax = sendMax;
2123 				// restore send status
2124 			return status;
2125 		}
2126 
2127 		if (shouldStartRetransmitTimer && size > 0) {
2128 			TRACE("starting initial retransmit timer of: %" B_PRIdBIGTIME,
2129 				fRetransmitTimeout);
2130 			gStackModule->set_timer(&fRetransmitTimer, fRetransmitTimeout);
2131 			T(TimerSet(this, "retransmit", fRetransmitTimeout));
2132 			shouldStartRetransmitTimer = false;
2133 		}
2134 
2135 		if (segment.flags & TCP_FLAG_ACKNOWLEDGE)
2136 			fLastAcknowledgeSent = segment.acknowledge;
2137 
     		// The control flags have been sent with this segment, any further
     		// segment of this call must not carry them again.
2138 		length -= segmentLength;
2139 		segment.flags &= ~(TCP_FLAG_SYNCHRONIZE | TCP_FLAG_RESET
2140 			| TCP_FLAG_FINISH);
2141 
     		// a retransmission sends a single segment per call only
2142 		if (retransmit)
2143 			break;
2144 
2145 	} while (length > 0);
2146 
2147 	return B_OK;
2148 }
2149 
2150 
2151 int
2152 TCPEndpoint::_MaxSegmentSize(const sockaddr* address) const
2153 {
2154 	return next->module->get_mtu(next, address) - sizeof(tcp_header);
2155 }
2156 
2157 
2158 status_t
2159 TCPEndpoint::_PrepareSendPath(const sockaddr* peer)
2160 {
2161 	if (fRoute == NULL) {
2162 		fRoute = gDatalinkModule->get_route(Domain(), peer);
2163 		if (fRoute == NULL)
2164 			return ENETUNREACH;
2165 
2166 		if ((fRoute->flags & RTF_LOCAL) != 0)
2167 			fFlags |= FLAG_LOCAL;
2168 	}
2169 
2170 	// make sure connection does not already exist
2171 	status_t status = fManager->SetConnection(this, *LocalAddress(), peer,
2172 		fRoute->interface_address->local);
2173 	if (status < B_OK)
2174 		return status;
2175 
2176 	fInitialSendSequence = system_time() >> 4;
2177 	fSendNext = fInitialSendSequence;
2178 	fSendUnacknowledged = fInitialSendSequence;
2179 	fSendMax = fInitialSendSequence;
2180 	fSendUrgentOffset = fInitialSendSequence;
2181 
2182 	// we are counting the SYN here
2183 	fSendQueue.SetInitialSequence(fSendNext + 1);
2184 
2185 	fReceiveMaxSegmentSize = _MaxSegmentSize(peer);
2186 
2187 	// Compute the window shift we advertise to our peer - if it doesn't support
2188 	// this option, this will be reset to 0 (when its SYN is received)
2189 	fReceiveWindowShift = 0;
2190 	while (fReceiveWindowShift < TCP_MAX_WINDOW_SHIFT
2191 		&& (0xffffUL << fReceiveWindowShift) < socket->receive.buffer_size) {
2192 		fReceiveWindowShift++;
2193 	}
2194 
2195 	return B_OK;
2196 }
2197 
2198 
2199 void
2200 TCPEndpoint::_Acknowledged(tcp_segment_header& segment)
2201 {
2202 	TRACE("_Acknowledged(): ack %" B_PRIu32 "; uack %" B_PRIu32 "; next %"
2203 		B_PRIu32 "; max %" B_PRIu32, segment.acknowledge,
2204 		fSendUnacknowledged.Number(), fSendNext.Number(), fSendMax.Number());
2205 
2206 	ASSERT(fSendUnacknowledged <= segment.acknowledge);
2207 
2208 	if (fSendUnacknowledged < segment.acknowledge) {
2209 		fSendQueue.RemoveUntil(segment.acknowledge);
2210 		fSendUnacknowledged = segment.acknowledge;
2211 		if (fSendNext < fSendUnacknowledged)
2212 			fSendNext = fSendUnacknowledged;
2213 
2214 		if (segment.options & TCP_HAS_TIMESTAMPS)
2215 			_UpdateRoundTripTime(tcp_diff_timestamp(segment.timestamp_reply));
2216 		else {
2217 			// TODO: Fallback to RFC 793 type estimation; This just resets
2218 			// any potential exponential back off that happened due to
2219 			// retransmits.
2220 			fRetransmitTimeout = TCP_INITIAL_RTT;
2221 		}
2222 
2223 		if (fSendUnacknowledged == fSendMax) {
2224 			TRACE("all acknowledged, cancelling retransmission timer");
2225 			gStackModule->cancel_timer(&fRetransmitTimer);
2226 			T(TimerSet(this, "retransmit", -1));
2227 		} else {
2228 			TRACE("data acknowledged, resetting retransmission timer to: %"
2229 				B_PRIdBIGTIME, fRetransmitTimeout);
2230 			gStackModule->set_timer(&fRetransmitTimer, fRetransmitTimeout);
2231 			T(TimerSet(this, "retransmit", fRetransmitTimeout));
2232 		}
2233 
2234 		if (is_writable(fState)) {
2235 			// notify threads waiting on the socket to become writable again
2236 			fSendCondition.NotifyAll();
2237 			gSocketModule->notify(socket, B_SELECT_WRITE, fSendQueue.Free());
2238 		}
2239 
2240 		if (fCongestionWindow < fSlowStartThreshold)
2241 			fCongestionWindow += fSendMaxSegmentSize;
2242 	}
2243 
2244 	if (fCongestionWindow >= fSlowStartThreshold) {
2245 		uint32 increment = fSendMaxSegmentSize * fSendMaxSegmentSize;
2246 
2247 		if (increment < fCongestionWindow)
2248 			increment = 1;
2249 		else
2250 			increment /= fCongestionWindow;
2251 
2252 		fCongestionWindow += increment;
2253 	}
2254 
2255 	// if there is data left to be sent, send it now
2256 	if (fSendQueue.Used() > 0)
2257 		_SendQueued();
2258 }
2259 
2260 
2261 void
2262 TCPEndpoint::_Retransmit()
2263 {
2264 	TRACE("Retransmit()");
2265 
2266 	_ResetSlowStart();
2267 	fSendNext = fSendUnacknowledged;
2268 
2269 	// Do exponential back off of the retransmit timeout
2270 	fRetransmitTimeout *= 2;
2271 	if (fRetransmitTimeout > TCP_MAX_RETRANSMIT_TIMEOUT)
2272 		fRetransmitTimeout = TCP_MAX_RETRANSMIT_TIMEOUT;
2273 
2274 	_SendQueued();
2275 }
2276 
2277 
2278 void
2279 TCPEndpoint::_UpdateRoundTripTime(int32 roundTripTime)
2280 {
2281 	int32 rtt = roundTripTime;
2282 
2283 	// "smooth" round trip time as per Van Jacobson
2284 	rtt -= fRoundTripTime / 8;
2285 	fRoundTripTime += rtt;
2286 	if (rtt < 0)
2287 		rtt = -rtt;
2288 	rtt -= fRoundTripDeviation / 4;
2289 	fRoundTripDeviation += rtt;
2290 
2291 	fRetransmitTimeout = ((fRoundTripTime / 4 + fRoundTripDeviation) / 2)
2292 		* kTimestampFactor;
2293 	if (fRetransmitTimeout < TCP_MIN_RETRANSMIT_TIMEOUT)
2294 		fRetransmitTimeout = TCP_MIN_RETRANSMIT_TIMEOUT;
2295 
2296 	TRACE("  RTO is now %" B_PRIdBIGTIME " (after rtt %" B_PRId32 "ms)",
2297 		fRetransmitTimeout, roundTripTime);
2298 }
2299 
2300 
2301 void
2302 TCPEndpoint::_ResetSlowStart()
2303 {
2304 	fSlowStartThreshold = max_c((fSendMax - fSendUnacknowledged).Number() / 2,
2305 		2 * fSendMaxSegmentSize);
2306 	fCongestionWindow = fSendMaxSegmentSize;
2307 }
2308 
2309 
2310 //	#pragma mark - timer
2311 
2312 
2313 /*static*/ void
2314 TCPEndpoint::_RetransmitTimer(net_timer* timer, void* _endpoint)
2315 {
2316 	TCPEndpoint* endpoint = (TCPEndpoint*)_endpoint;
2317 	T(TimerTriggered(endpoint, "retransmit"));
2318 
2319 	MutexLocker locker(endpoint->fLock);
2320 	if (!locker.IsLocked())
2321 		return;
2322 
2323 	endpoint->_Retransmit();
2324 }
2325 
2326 
2327 /*static*/ void
2328 TCPEndpoint::_PersistTimer(net_timer* timer, void* _endpoint)
2329 {
2330 	TCPEndpoint* endpoint = (TCPEndpoint*)_endpoint;
2331 	T(TimerTriggered(endpoint, "persist"));
2332 
2333 	MutexLocker locker(endpoint->fLock);
2334 	if (!locker.IsLocked())
2335 		return;
2336 
2337 	// the timer might not have been canceled early enough
2338 	if (endpoint->State() == CLOSED)
2339 		return;
2340 
2341 	endpoint->_SendQueued(true);
2342 }
2343 
2344 
2345 /*static*/ void
2346 TCPEndpoint::_DelayedAcknowledgeTimer(net_timer* timer, void* _endpoint)
2347 {
2348 	TCPEndpoint* endpoint = (TCPEndpoint*)_endpoint;
2349 	T(TimerTriggered(endpoint, "delayed ack"));
2350 
2351 	MutexLocker locker(endpoint->fLock);
2352 	if (!locker.IsLocked())
2353 		return;
2354 
2355 	// the timer might not have been canceled early enough
2356 	if (endpoint->State() == CLOSED)
2357 		return;
2358 
2359 	endpoint->SendAcknowledge(true);
2360 }
2361 
2362 
2363 /*static*/ void
2364 TCPEndpoint::_TimeWaitTimer(net_timer* timer, void* _endpoint)
2365 {
2366 	TCPEndpoint* endpoint = (TCPEndpoint*)_endpoint;
2367 	T(TimerTriggered(endpoint, "time-wait"));
2368 
2369 	MutexLocker locker(endpoint->fLock);
2370 	if (!locker.IsLocked())
2371 		return;
2372 
2373 	if ((endpoint->fFlags & FLAG_CLOSED) == 0) {
2374 		endpoint->fFlags |= FLAG_DELETE_ON_CLOSE;
2375 		return;
2376 	}
2377 
2378 	locker.Unlock();
2379 
2380 	gSocketModule->release_socket(endpoint->socket);
2381 }
2382 
2383 
2384 /*static*/ status_t
2385 TCPEndpoint::_WaitForCondition(ConditionVariable& condition,
2386 	MutexLocker& locker, bigtime_t timeout)
2387 {
2388 	ConditionVariableEntry entry;
2389 	condition.Add(&entry);
2390 
2391 	locker.Unlock();
2392 	status_t result = entry.Wait(B_ABSOLUTE_TIMEOUT | B_CAN_INTERRUPT, timeout);
2393 	locker.Lock();
2394 
2395 	return result;
2396 }
2397 
2398 
2399 //	#pragma mark -
2400 
2401 
2402 void
2403 TCPEndpoint::Dump() const
2404 {
2405 	kprintf("TCP endpoint %p\n", this);
2406 	kprintf("  state: %s\n", name_for_state(fState));
2407 	kprintf("  flags: 0x%" B_PRIx32 "\n", fFlags);
2408 #if KDEBUG
2409 	kprintf("  lock: { %p, holder: %" B_PRId32 " }\n", &fLock, fLock.holder);
2410 #endif
2411 	kprintf("  accept sem: %" B_PRId32 "\n", fAcceptSemaphore);
2412 	kprintf("  options: 0x%" B_PRIx32 "\n", (uint32)fOptions);
2413 	kprintf("  send\n");
2414 	kprintf("    window shift: %" B_PRIu8 "\n", fSendWindowShift);
2415 	kprintf("    unacknowledged: %" B_PRIu32 "\n",
2416 		fSendUnacknowledged.Number());
2417 	kprintf("    next: %" B_PRIu32 "\n", fSendNext.Number());
2418 	kprintf("    max: %" B_PRIu32 "\n", fSendMax.Number());
2419 	kprintf("    urgent offset: %" B_PRIu32 "\n", fSendUrgentOffset.Number());
2420 	kprintf("    window: %" B_PRIu32 "\n", fSendWindow);
2421 	kprintf("    max window: %" B_PRIu32 "\n", fSendMaxWindow);
2422 	kprintf("    max segment size: %" B_PRIu32 "\n", fSendMaxSegmentSize);
2423 	kprintf("    queue: %" B_PRIuSIZE " / %" B_PRIuSIZE "\n", fSendQueue.Used(),
2424 		fSendQueue.Size());
2425 #if DEBUG_BUFFER_QUEUE
2426 	fSendQueue.Dump();
2427 #endif
2428 	kprintf("    last acknowledge sent: %" B_PRIu32 "\n",
2429 		fLastAcknowledgeSent.Number());
2430 	kprintf("    initial sequence: %" B_PRIu32 "\n",
2431 		fInitialSendSequence.Number());
2432 	kprintf("  receive\n");
2433 	kprintf("    window shift: %" B_PRIu8 "\n", fReceiveWindowShift);
2434 	kprintf("    next: %" B_PRIu32 "\n", fReceiveNext.Number());
2435 	kprintf("    max advertised: %" B_PRIu32 "\n",
2436 		fReceiveMaxAdvertised.Number());
2437 	kprintf("    window: %" B_PRIu32 "\n", fReceiveWindow);
2438 	kprintf("    max segment size: %" B_PRIu32 "\n", fReceiveMaxSegmentSize);
2439 	kprintf("    queue: %" B_PRIuSIZE " / %" B_PRIuSIZE "\n",
2440 		fReceiveQueue.Available(), fReceiveQueue.Size());
2441 #if DEBUG_BUFFER_QUEUE
2442 	fReceiveQueue.Dump();
2443 #endif
2444 	kprintf("    initial sequence: %" B_PRIu32 "\n",
2445 		fInitialReceiveSequence.Number());
2446 	kprintf("    duplicate acknowledge count: %" B_PRIu32 "\n",
2447 		fDuplicateAcknowledgeCount);
2448 	kprintf("  round trip time: %" B_PRId32 " (deviation %" B_PRId32 ")\n",
2449 		fRoundTripTime, fRoundTripDeviation);
2450 	kprintf("  retransmit timeout: %" B_PRId64 "\n", fRetransmitTimeout);
2451 	kprintf("  congestion window: %" B_PRIu32 "\n", fCongestionWindow);
2452 	kprintf("  slow start threshold: %" B_PRIu32 "\n", fSlowStartThreshold);
2453 }
2454 
2455